feat: dynamic Wikimedia projects and logging enhancements

Implemented dynamic fetching of Wikimedia projects and languages from the Wikimedia API, replacing static definitions. Enhanced logging with a detailed and formatted log output for better debugging and monitoring. Updated templates to list projects and languages dynamically. Improved URL handling and proxying for robust performance. This change ensures up-to-date project information and enhances overall application transparency and maintainability.
This commit is contained in:
Kumi 2024-07-16 08:24:40 +02:00
parent fcb6a4aa96
commit d4c60d98d9
Signed by: kumi
GPG key ID: ECBCC9082395383F
2 changed files with 122 additions and 31 deletions

137
app.py
View file

@ -1,26 +1,63 @@
from flask import Flask, render_template, request, redirect, url_for from flask import Flask, render_template, request, redirect, url_for
import urllib.request import urllib.request
from urllib.parse import urlencode from urllib.parse import urlencode, urlparse
from html import escape from html import escape
import json import json
import logging
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
app = Flask(__name__) app = Flask(__name__)
# Configure the root logger at DEBUG. basicConfig() attaches a stream handler
# to the root logger when none has been configured yet.
logging.basicConfig(level=logging.DEBUG)

logger = logging.getLogger(__name__)

# Dedicated handler so this module's records carry a timestamp, the logger
# name and the level in front of the message.
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
handler.setFormatter(formatter)
logger.addHandler(handler)

# Records handled here must not also bubble up to the root handler installed
# by basicConfig(); otherwise every message is printed twice.
logger.propagate = False
def get_wikimedia_projects(
    url="https://meta.wikimedia.org/w/api.php?action=sitematrix&format=json",
    timeout=30,
):
    """Fetch the Wikimedia site matrix and index it by project and language.

    Args:
        url: Endpoint returning the site matrix as JSON. Defaults to the
            Meta-Wiki ``sitematrix`` API call.
        timeout: Seconds to wait for the response before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        A ``(projects, languages)`` tuple.

        ``projects`` maps each project code to the site name of its English
        edition. ``languages`` maps each language code to
        ``{"projects": {project_code: site_url}, "name": language_name}``;
        languages without any site are left out.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        KeyError: If the response has no ``"sitematrix"`` entry.
    """
    # Identify the client explicitly instead of relying on urllib's default.
    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": "Wikimore (https://git.private.coffee/privatecoffee/wikimore)"
        },
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        data = json.loads(response.read().decode())

    projects = {}
    languages = {}

    for key, value in data["sitematrix"].items():
        # Only digit-keyed entries are treated as languages; any other key
        # in the matrix is ignored.
        if not key.isdigit():
            continue

        language_code = value["code"]
        language_projects = {}

        # A language entry may come without any sites; treat that as empty.
        for site in value.get("site", []):
            language_projects[site["code"]] = site["url"]

            # The English editions supply the display names for projects.
            if language_code == "en":
                projects[site["code"]] = site["sitename"]

        if language_projects:
            languages[language_code] = {
                "projects": language_projects,
                # Fall back to the code so the language never has an empty label.
                "name": value.get("name") or language_code,
            }

    return projects, languages
def get_proxy_url(url):
    """Map an asset URL to the local ``/proxy`` endpoint when applicable.

    Protocol-relative URLs (starting with ``//``) are first made absolute by
    prefixing ``https:``. Only URLs under ``https://upload.wikimedia.org/``
    are rewritten to ``/proxy?url=...``; any other URL is returned in its
    absolute form.
    """
    absolute = "https:" + url if url.startswith("//") else url

    if absolute.startswith("https://upload.wikimedia.org/"):
        logger.debug(f"Generating proxy URL for {absolute}")
        query = urlencode({"url": absolute})
        return f"/proxy?{query}"

    logger.debug(f"Not generating proxy URL for {absolute}")
    return absolute
@ -29,8 +66,11 @@ def proxy():
url = request.args.get("url") url = request.args.get("url")
if not url or not url.startswith("https://upload.wikimedia.org/"): if not url or not url.startswith("https://upload.wikimedia.org/"):
logger.error(f"Invalid URL for proxying: {url}")
return "Invalid URL" return "Invalid URL"
logger.debug(f"Proxying {url}")
with urllib.request.urlopen(url) as response: with urllib.request.urlopen(url) as response:
data = response.read() data = response.read()
return data return data
@ -38,7 +78,11 @@ def proxy():
@app.route("/")
def home():
    """Render ``home.html`` with the project and language mappings stored on the app."""
    context = {
        "wikimedia_projects": app.wikimedia_projects,
        "languages": app.languages,
    }
    return render_template("home.html", **context)
@app.route("/search", methods=["GET", "POST"]) @app.route("/search", methods=["GET", "POST"])
@ -50,35 +94,58 @@ def search():
return redirect( return redirect(
url_for("search_results", project=project, lang=lang, query=query) url_for("search_results", project=project, lang=lang, query=query)
) )
return render_template("search.html") return render_template(
"search.html",
wikimedia_projects=app.wikimedia_projects,
languages=app.languages,
)
@app.route("/<project>/<lang>/wiki/<title>") @app.route("/<project>/<lang>/wiki/<title>")
def wiki_article(project, lang, title): def wiki_article(project, lang, title):
base_url = WIKIMEDIA_PROJECTS.get(project, "wikipedia.org") language_projects = app.languages.get(lang, {}).get("projects", {})
url = f"https://{lang}.{base_url}/w/api.php?action=query&format=json&titles={escape(title.replace(" ", "_"), True)}&prop=revisions&rvprop=content&rvparse=1" base_url = language_projects.get(project)
if not base_url:
return "Invalid language or project"
logger.debug(f"Fetching {title} from {base_url}")
url = f"{base_url}/w/api.php?action=query&format=json&titles={escape(title.replace(" ", "_"), True)}&prop=revisions&rvprop=content&rvparse=1"
with urllib.request.urlopen(url) as response: with urllib.request.urlopen(url) as response:
data = json.loads(response.read().decode()) data = json.loads(response.read().decode())
pages = data["query"]["pages"] pages = data["query"]["pages"]
article_html = next(iter(pages.values()))["revisions"][0]["*"] article_html = next(iter(pages.values()))["revisions"][0]["*"]
soup = BeautifulSoup(article_html, "html.parser") soup = BeautifulSoup(article_html, "html.parser")
for a in soup.find_all("a", href=True): for a in soup.find_all("a", href=True):
href = a["href"] href = a["href"]
if href.startswith("/wiki/"): if href.startswith("/wiki/"):
a["href"] = f"/{project}/{lang}{href}" a["href"] = f"/{project}/{lang}{href}"
elif href.startswith("//") or href.startswith("https://"): elif href.startswith("//") or href.startswith("https://"):
parts = href.split("/") parts = urlparse(href)
if len(parts) > 4:
target_project = ".".join(parts[2].split(".")[1:]) target_domain = parts.netloc
target_lang = parts[2].split(".")[0] path_parts = parts.path.split("/")
target_title = "/".join(parts[4:])
if target_project in WIKIMEDIA_PROJECTS.values(): if len(path_parts) > 4:
target_project = list(WIKIMEDIA_PROJECTS.keys())[ target_title = "/".join(path_parts[4:])
list(WIKIMEDIA_PROJECTS.values()).index(target_project) target_lang = target_domain.split(".")[0]
]
a["href"] = f"/{target_project}/{target_lang}/wiki/{target_title}" found = False
for language, language_projects in app.languages.items():
for project_name, project_url in language_projects.items():
if target_domain == project_url:
a["href"] = (
f"/{project_name}/{target_lang}/wiki/{target_title}"
)
found = True
break
if found:
break
for span in soup.find_all("span", class_="mw-editsection"): for span in soup.find_all("span", class_="mw-editsection"):
span.decompose() span.decompose()
@ -89,19 +156,37 @@ def wiki_article(project, lang, title):
for img in soup.find_all("img"): for img in soup.find_all("img"):
img["src"] = get_proxy_url(img["src"]) img["src"] = get_proxy_url(img["src"])
for source in soup.find_all("source"):
source["src"] = get_proxy_url(source["src"])
for video in soup.find_all("video"):
video["poster"] = get_proxy_url(video["poster"])
for li in soup.find_all("li"): for li in soup.find_all("li"):
# If "nv-view", "nv-talk", "nv-edit" classes are on the li element, remove it
if any(cls in li.get("class", []) for cls in ["nv-view", "nv-talk", "nv-edit"]): if any(cls in li.get("class", []) for cls in ["nv-view", "nv-talk", "nv-edit"]):
li.decompose() li.decompose()
processed_html = str(soup) processed_html = str(soup)
return render_template("article.html", title=title, content=processed_html) return render_template(
"article.html",
title=title,
content=processed_html,
wikimedia_projects=app.wikimedia_projects,
languages=app.languages,
)
@app.route("/<project>/<lang>/search/<query>") @app.route("/<project>/<lang>/search/<query>")
def search_results(project, lang, query): def search_results(project, lang, query):
base_url = WIKIMEDIA_PROJECTS.get(project, "wikipedia.org") language_projects = app.languages.get(lang, {}).get("projects", {})
url = f"https://{lang}.{base_url}/w/api.php?action=query&format=json&list=search&srsearch={query}" base_url = language_projects.get(project)
if not base_url:
return "Invalid language or project"
logger.debug(f"Searching {base_url} for {query}")
url = f"{base_url}/w/api.php?action=query&format=json&list=search&srsearch={query}"
with urllib.request.urlopen(url) as response: with urllib.request.urlopen(url) as response:
data = json.loads(response.read().decode()) data = json.loads(response.read().decode())
search_results = data["query"]["search"] search_results = data["query"]["search"]
@ -111,6 +196,8 @@ def search_results(project, lang, query):
search_results=search_results, search_results=search_results,
project=project, project=project,
lang=lang, lang=lang,
wikimedia_projects=app.wikimedia_projects,
languages=app.languages,
) )
@ -119,5 +206,9 @@ def search_redirect(project, lang, query):
return redirect(url_for("search_results", project=project, lang=lang, query=query)) return redirect(url_for("search_results", project=project, lang=lang, query=query))
# Load the site matrix once at import time and keep it on the app object,
# where the view functions read it from.
app.wikimedia_projects, app.languages = get_wikimedia_projects()

# Report what was loaded through the module logger rather than a bare print().
logger.debug(
    "Loaded %d Wikimedia projects and %d languages",
    len(app.wikimedia_projects),
    len(app.languages),
)

if __name__ == "__main__":
    app.run(debug=True)

View file

@ -87,14 +87,14 @@
<h1><a href="/">Wikimore</a></h1> <h1><a href="/">Wikimore</a></h1>
<form id="search-form" action="{{ url_for('search') }}" method="post"> <form id="search-form" action="{{ url_for('search') }}" method="post">
<select name="project" id="project"> <select name="project" id="project">
<option value="wikipedia">Wikipedia</option> {% for key, value in wikimedia_projects.items() %}
<option value="wiktionary">Wiktionary</option> <option value="{{ key }}">{{ value }}</option>
<!-- TODO: Add more projects --> {% endfor %}
</select> </select>
<select name="lang" id="lang"> <select name="lang" id="lang">
<option value="en">English</option> {% for key, value in languages.items() %}
<option value="de">German</option> <option value="{{ key }}">{{ value["name"] }}</option>
<!-- TODO: Add more languages --> {% endfor %}
</select> </select>
<input type="text" name="query" id="query" placeholder="Search Wikipedia" required> <input type="text" name="query" id="query" placeholder="Search Wikipedia" required>
<button type="submit">Search</button> <button type="submit">Search</button>
@ -104,7 +104,7 @@
{% block content %}{% endblock %} {% block content %}{% endblock %}
</div> </div>
<div id="footer"> <div id="footer">
<p>Brought to you by <a href="https://git.private.coffee/privatecoffee/wikimore">Wikimore</a></p> <p>Brought to you by <a href="https://git.private.coffee/privatecoffee/wikimore">Wikimore</a>, a <a href="https://private.coffee">Private.coffee</a> project</p>
</div> </div>
</body> </body>
</html> </html>