feat: enhance content extraction and documentation
Added a docstring to the get_content function to improve code documentation and clarify its intent. Enhanced URL processing logic for article links to ensure more readable and consistent URLs within the application. Included a docstring for the main function to outline its purpose. Improves code maintainability and readability.
This commit is contained in:
parent
5f710d8a31
commit
51e31fc8fe
1 changed files with 15 additions and 2 deletions
|
@ -94,6 +94,14 @@ def article_page(path):
|
|||
|
||||
|
||||
def get_content(soup: BeautifulSoup) -> BeautifulSoup:
|
||||
"""Extracts the article content from the soup.
|
||||
|
||||
Args:
|
||||
soup (BeautifulSoup): The soup of the article page.
|
||||
|
||||
Returns:
|
||||
BeautifulSoup: The article content.
|
||||
"""
|
||||
article_content = soup.find("div", {"class": "a-wrapper"}).find("article")
|
||||
|
||||
for img in article_content.find_all("img"):
|
||||
|
@ -107,9 +115,13 @@ def get_content(soup: BeautifulSoup) -> BeautifulSoup:
|
|||
ad.decompose()
|
||||
|
||||
for link in article_content.find_all("a"):
|
||||
if link.get("href") and link["href"].startswith("https://www.geeksforgeeks.org/"):
|
||||
if link.get("href") and link["href"].startswith(
|
||||
"https://www.geeksforgeeks.org/"
|
||||
):
|
||||
if not link["href"].startswith("https://www.geeksforgeeks.org/user/"):
|
||||
link["href"] = f"/{link['href'].replace('https://www.geeksforgeeks.org/', '')}"
|
||||
link["href"] = (
|
||||
f"/{link['href'].replace('https://www.geeksforgeeks.org/', '')}"
|
||||
)
|
||||
|
||||
else:
|
||||
classes = link.get("class", [])
|
||||
|
@ -125,6 +137,7 @@ def get_content(soup: BeautifulSoup) -> BeautifulSoup:
|
|||
|
||||
|
||||
def main():
|
||||
"""Runs the app."""
|
||||
port = int(os.getenv("PORT", 8113))
|
||||
debug = bool(os.getenv("DEBUG", False))
|
||||
app.run(port=port, debug=debug)
|
||||
|
|
Loading…
Reference in a new issue