feat: add proxy service for GeeksforGeeks articles

Set up initial project structure and dependencies for Ducks for Ducks proxy service.

- Added .gitignore to exclude virtual environments, __pycache__, and build directories.
- Included MIT License for the project.
- Created README.md for project documentation.
- Configured pyproject.toml to use Hatchling as build system, specified project metadata and dependencies.
- Implemented Flask app for proxying GeeksforGeeks articles:
  - Routes to serve static files, index page, proxy images, and render articles.
  - Function to fetch and clean article content.
- Added custom CSS for styling differences between internal and external links.
- Included Bootstrap CSS for base styling.
- Created HTML templates for base layout, index, and article pages.

This change sets up the core functionality of proxying GeeksforGeeks content through the Flask application.
This commit is contained in:
Kumi 2024-08-27 16:38:13 +02:00
commit 5f710d8a31
Signed by: kumi
GPG key ID: ECBCC9082395383F
11 changed files with 277 additions and 0 deletions

5
.gitignore vendored Normal file
View file

@ -0,0 +1,5 @@
venv/
.venv/
__pycache__/
*.pyc
/dist/

19
LICENSE Normal file
View file

@ -0,0 +1,19 @@
Copyright (c) 2024 Private.coffee Team <support@private.coffee>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

0
README.md Normal file
View file

26
pyproject.toml Normal file
View file

@ -0,0 +1,26 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "ducksforducks"
version = "0.1.0"
authors = [{ name = "Private.coffee Team", email = "support@private.coffee" }]
description = "A simple frontend for GeeksforGeeks"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
# "beautifulsoup4" is the distribution that provides the ``bs4`` module;
# the "bs4" name on PyPI is only a placeholder package.
dependencies = ["flask", "beautifulsoup4"]

[project.scripts]
ducksforducks = "ducksforducks.app:main"

[project.urls]
"Homepage" = "https://git.private.coffee/privatecoffee/ducksforducks"
"Bug Tracker" = "https://git.private.coffee/privatecoffee/ducksforducks/issues"
"Source Code" = "https://git.private.coffee/privatecoffee/ducksforducks"

View file

134
src/ducksforducks/app.py Normal file
View file

@ -0,0 +1,134 @@
import logging
import os
import urllib
import urllib.error
import urllib.parse
import urllib.request
from typing import Text

from bs4 import BeautifulSoup
from flask import (
    Flask,
    request,
    send_from_directory,
    render_template,
    Response,
)
from flask.logging import default_handler
app = Flask(__name__)

# Configure the root logger so DEBUG-level records are emitted.
logging.basicConfig(level=logging.DEBUG)

# Module logger with its own DEBUG-level stream handler.
# NOTE(review): records from this logger also propagate to the root handler
# installed by basicConfig() above, so they are likely printed twice --
# confirm whether the extra handler is intentional.
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)

# Remove the default Flask log handler.
# This names the handler explicitly instead of using
# ``app.logger.handlers[0]``: Flask only attaches its default handler when
# logging has not been configured already, so index 0 may not exist
# (IndexError at import time) or may be the handler added just above.
# ``removeHandler`` is a no-op when the handler is not attached.
app.logger.removeHandler(default_handler)
@app.route("/static/<path:path>")
def static_files(path: str) -> Response:
    """Send a file out of the ``static`` directory.

    Args:
        path (str): Location of the requested file inside ``static``.

    Returns:
        Response: The response produced by ``send_from_directory``.
    """
    static_dir = "static"
    return send_from_directory(static_dir, path)
@app.route("/")
def index() -> Text:
    """Render the landing page.

    Returns:
        Text: ``index.html`` rendered with the current request's host
        passed in as ``host``.
    """
    return render_template("index.html", host=request.host)
@app.route("/proxy")
def proxy() -> Response:
    """A simple proxy for images.

    Reads the target from the ``url`` query parameter. Only URLs below
    ``https://media.geeksforgeeks.org/`` are fetched.

    Returns:
        Response: The upstream body with the upstream ``Content-Type``.
        A 400 response is returned when ``url`` is missing or not allowed,
        the upstream status code when the upstream answers with an HTTP
        error, and 502 when the upstream cannot be reached.
    """
    url = request.args.get("url")

    if not url or not url.startswith("https://media.geeksforgeeks.org/"):
        logger.error(f"Invalid URL for proxying: {url}")
        # Report the rejection as a client error rather than a 200 page.
        return Response("Invalid URL", status=400, mimetype="text/plain")

    logger.debug(f"Proxying {url}")

    try:
        # The timeout keeps a stalled upstream from blocking the worker.
        with urllib.request.urlopen(url, timeout=10) as upstream:
            data = upstream.read()
            content_type = upstream.headers.get("Content-Type")
    except urllib.error.HTTPError as err:
        # urlopen raises for non-2xx answers; pass the status through.
        logger.error(f"Upstream returned {err.code} for {url}")
        return Response("Upstream error", status=err.code, mimetype="text/plain")
    except OSError as err:
        # URLError, connection failures and timeouts are all OSError.
        logger.error(f"Could not fetch {url}: {err}")
        return Response("Upstream unreachable", status=502, mimetype="text/plain")

    # Without an explicit type Flask would label the bytes as text/html.
    return Response(data, content_type=content_type or "application/octet-stream")
@app.route("/<path:path>")
def article_page(path):
    """Renders the article page.

    Fetches ``https://www.geeksforgeeks.org/<path>``, extracts the article
    with ``get_content`` and renders it through ``article.html``.

    Args:
        path (str): The path to the article.

    Returns:
        Text: The rendered article page, or a ``(body, status)`` tuple with
        the rendered error page when the upstream request fails.
    """
    # Flask hands over the path already percent-decoded; encode it again so
    # spaces and non-ASCII characters still form a valid request URL.
    url = f"https://www.geeksforgeeks.org/{urllib.parse.quote(path)}"

    soup = None
    try:
        # The context manager closes the upstream connection; the timeout
        # keeps a stalled upstream from blocking the worker.
        with urllib.request.urlopen(url, timeout=10) as response:
            status = response.getcode()
            if status == 200:
                soup = BeautifulSoup(response, "html.parser")
    except urllib.error.HTTPError as err:
        # urlopen raises for non-2xx answers instead of returning them.
        logger.error(f"Upstream returned {err.code} for {url}")
        status = err.code
    except OSError as err:
        # URLError, connection failures and timeouts are all OSError.
        logger.error(f"Could not fetch {url}: {err}")
        status = 502

    if soup is None:
        # NOTE(review): "error.html" is not among the templates visible in
        # this change -- confirm it exists.
        return render_template("error.html", code=status), status

    content = get_content(soup)

    # Fall back to the request path when the article has no title element.
    title_tag = content.find(class_="article-title")
    title = title_tag.text if title_tag is not None else path

    return render_template("article.html", content=content, title=title)
def get_content(soup: BeautifulSoup) -> BeautifulSoup:
    """Extract and clean the article element of a fetched page.

    The ``<article>`` inside ``div.a-wrapper`` is modified in place:

    * image sources are routed through ``/proxy``,
    * ``<script>`` and ``<style>`` elements are removed,
    * elements whose ``id`` starts with ``GFG_AD_`` are removed,
    * links to ``https://www.geeksforgeeks.org/`` become site-relative,
      except ``/user/`` links, which keep their target and get the
      ``gfg-link`` class,
    * other absolute links get the ``external-link`` class.

    Args:
        soup (BeautifulSoup): The parsed page.

    Returns:
        BeautifulSoup: The cleaned article element.

    Raises:
        ValueError: If the page does not contain the expected article
            element.
    """
    gfg_root = "https://www.geeksforgeeks.org/"

    wrapper = soup.find("div", {"class": "a-wrapper"})
    article_content = wrapper.find("article") if wrapper is not None else None
    if article_content is None:
        raise ValueError("Page does not contain an article element")

    for img in article_content.find_all("img"):
        src = img.get("src")
        if src:
            # Encode the whole URL so its own "?", "&" or "#" cannot break
            # out of the ``url`` query parameter.
            img["src"] = f"/proxy?url={urllib.parse.quote(src, safe='')}"

    for element in article_content.find_all(["script", "style"]):
        element.decompose()

    def is_ad_id(value):
        # Called with None for elements that have no id attribute.
        return bool(value) and value.startswith("GFG_AD_")

    # Look up one ad at a time: decomposing also destroys the descendants,
    # so a list collected up front could hold already-destroyed elements.
    while (ad := article_content.find(id=is_ad_id)) is not None:
        ad.decompose()

    for link in article_content.find_all("a"):
        href = link.get("href")
        if not href:
            # Anchors without a target lead nowhere; nothing to annotate.
            continue

        if href.startswith(gfg_root):
            if not href.startswith(f"{gfg_root}user/"):
                # Strip only the leading site prefix (not later occurrences)
                # so the link resolves against this service.
                link["href"] = f"/{href.removeprefix(gfg_root)}"
                continue
            marker = "gfg-link"
        elif href.startswith("#") or (
            href.startswith("/") and not href.startswith("//")
        ):
            # Fragment and root-relative links stay on this site.
            continue
        else:
            marker = "external-link"

        link["class"] = [*link.get("class", []), marker]

    return article_content
def main():
    """Start the Flask development server.

    Reads two environment variables:

    * ``PORT``: port to listen on (default ``8113``).
    * ``DEBUG``: enables Flask debug mode unless it is unset, empty, or one
      of ``0``, ``false``, ``no``, ``off`` (case-insensitive).
    """
    port = int(os.getenv("PORT", "8113"))

    # bool() on the raw string is True for every non-empty value, which
    # would turn debug mode on even for DEBUG=0 or DEBUG=false.
    debug_value = os.getenv("DEBUG", "").strip().lower()
    debug = debug_value not in {"", "0", "false", "no", "off"}

    app.run(port=port, debug=debug)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,23 @@
/* Elements that are never displayed. None of these selectors occur in this
   project's own templates; presumably they belong to the proxied article
   markup -- confirm against upstream. */
.onopen-discussion-panel,
.article_bottom_suggestion_wrapper,
.article-meta-author-details-follow-button,
.three_dot_dropdown,
#myDropdown {
  display: none;
}

/* Shared appearance of the hint appended after annotated links. */
a.gfg-link::after,
a.external-link::after {
  font-size: 0.8em;
  color: #888;
}

/* Hint text for links carrying the "gfg-link" class. */
a.gfg-link::after {
  content: " (opens on geeksforgeeks.org)";
}

/* Hint text for links carrying the "external-link" class. */
a.external-link::after {
  content: " (external link)";
}

.text {
  margin-top: 2em;
}

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,10 @@
{% extends "base.html" %}
{# Article page: fills the "title" and "content" blocks of base.html.
   Expects the variables `title` and `content`. #}
{% block title %}{{ title }} - Ducks for Ducks {%
endblock %} {% block content %}
<div class="row">
<div class="col-md-9">
<article>
{#- `content` is emitted without HTML escaping because of the `safe` filter. #}
{{ content | safe }}
</article>
</div>
</div>
{% endblock %}

View file

@ -0,0 +1,36 @@
{# Base layout: navigation bar, a "content" block and a footer. Child
   templates may override the "title" and "content" blocks. -#}
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
{#- Default page title; child templates replace it via the "title" block. #}
<title>{% block title %}Ducks for Ducks{% endblock %}</title>
{#- Stylesheets are resolved through the "static" endpoint. #}
<link
href="{{ url_for('static', filename='dist/css/bootstrap.min.css') }}"
rel="stylesheet"
/>
<link
rel="stylesheet"
href="{{ url_for('static', filename='css/style.css') }}"
/>
</head>
<body>
<nav class="navbar navbar-dark bg-dark">
<div class="container">
<a class="navbar-brand" href="{{ url_for('index') }}"
>Ducks for Ducks</a
>
</div>
</nav>
{#- Page body supplied by the child template. #}
<div class="container mt-4">{% block content %}{% endblock %}</div>
<footer class="footer mt-4 py-3 bg-dark text-light">
<div class="container">
<span class="text-muted"
>Ducks for Ducks is brought to you by
<a href="https://git.private.coffee/PrivateCoffee/ducksforducks"
>Private.coffee</a
>
</span>
</div>
</footer>
</body>
</html>

View file

@ -0,0 +1,17 @@
{% extends "base.html" %}
{# Landing page: explains how to use the service. Expects the variable
   `host`, which is inserted into the example URL below. #}
{% block title %}Ducks for Ducks - Home{% endblock %}
{% block content %}
<div class="jumbotron">
<h1 class="display-4">Welcome to Ducks for Ducks!</h1>
<p class="lead">Ducks for Ducks is a simple proxy service to geeksforgeeks.org</p>
<hr class="my-4">
<h3>Usage</h3>
<p>To use this service, simply replace <code>geeksforgeeks.org</code> with <code>{{ host }}</code> in the site URL, like this:</p>
{#- The example link always uses the https scheme, whatever scheme the request used. #}
<a href="https://{{ host }}/the-fox-the-duck-and-a-circular-pond/"><code>https://{{ host }}/the-fox-the-duck-and-a-circular-pond/</code></a>
<hr class="my-4">
<h3>Work in progress</h3>
<p>Fetching the content of Geeks for Geeks' home page is still in the works. Also, user profiles and other features are Coming Soon&trade;.</p>
</div>
{% endblock %}
{% endblock %}