forked from PrivateCoffee/wikimore
feat: dynamic Wikimedia projects and logging enhancements
Implemented dynamic fetching of Wikimedia projects and languages from the Wikimedia sitematrix API, replacing the static definitions. Added detailed, formatted log output for easier debugging and monitoring. Updated the templates to list projects and languages dynamically. Improved URL handling and proxying, extending it to source and video elements. This keeps project information up to date and makes the application more transparent and maintainable.
parent fcb6a4aa96
commit d4c60d98d9
2 changed files with 122 additions and 31 deletions
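For context on the new dynamic fetching: the sitematrix API queried below returns per-language entries under numeric string keys, plus bookkeeping keys such as "count" and "specials". A minimal sketch of that shape (field names from the public API; values abridged and illustrative):

# Abridged sketch of the sitematrix response parsed by the new
# get_wikimedia_projects(); only numeric keys survive the key.isdigit()
# check, so "count" and "specials" are skipped.
sample = {
    "sitematrix": {
        "count": 969,
        "0": {
            "code": "en",
            "name": "English",
            "site": [
                {
                    "url": "https://en.wikipedia.org",
                    "dbname": "enwiki",
                    "code": "wiki",
                    "sitename": "Wikipedia",
                }
            ],
        },
        "specials": [],
    }
}

Note that project keys come from each site's "code" field, so Wikipedia ends up keyed as "wiki" rather than "wikipedia".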
app.py
@@ -1,17 +1,52 @@
 from flask import Flask, render_template, request, redirect, url_for
 import urllib.request
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urlparse
 from html import escape
 import json
+import logging
 from bs4 import BeautifulSoup
 
 app = Flask(__name__)
 
-WIKIMEDIA_PROJECTS = {
-    "wikipedia": "wikipedia.org",
-    "wiktionary": "wiktionary.org",
-    # TODO: Add more Wikimedia projects
-}
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger(__name__)
+
+handler = logging.StreamHandler()
+handler.setLevel(logging.DEBUG)
+logger.addHandler(handler)
+
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+handler.setFormatter(formatter)
+
+
+def get_wikimedia_projects():
+    url = "https://meta.wikimedia.org/w/api.php?action=sitematrix&format=json"
+    with urllib.request.urlopen(url) as response:
+        data = json.loads(response.read().decode())
+
+    projects = {}
+    languages = {}
+
+    for key, value in data["sitematrix"].items():
+        if key.isdigit():
+            language = value["name"]
+            language_code = value["code"]
+            language_projects = {}
+
+            for site in value["site"]:
+                language_projects[site["code"]] = site["url"]
+
+                if language_code == "en":
+                    projects[site["code"]] = site["sitename"]
+
+            if language_projects:
+                languages[language_code] = {
+                    "projects": language_projects,
+                    "name": language,
+                }
+
+    return projects, languages
 
 
 def get_proxy_url(url):
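Assuming the response shape sketched above, the two dicts this function returns would look roughly like this (illustrative values, not captured from a real run):

# Hypothetical result shapes for get_wikimedia_projects(): "projects" maps
# site codes to display names (taken from the English entry), "languages"
# maps language codes to per-project base URLs plus a display name.
projects = {"wiki": "Wikipedia", "wiktionary": "Wiktionary"}
languages = {
    "en": {
        "projects": {
            "wiki": "https://en.wikipedia.org",
            "wiktionary": "https://en.wiktionary.org",
        },
        "name": "English",
    },
}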
@@ -19,8 +54,10 @@ def get_proxy_url(url):
         url = "https:" + url
 
     if not url.startswith("https://upload.wikimedia.org/"):
+        logger.debug(f"Not generating proxy URL for {url}")
         return url
 
+    logger.debug(f"Generating proxy URL for {url}")
     return f"/proxy?{urlencode({'url': url})}"
 
 
@@ -29,8 +66,11 @@ def proxy():
     url = request.args.get("url")
 
     if not url or not url.startswith("https://upload.wikimedia.org/"):
+        logger.error(f"Invalid URL for proxying: {url}")
         return "Invalid URL"
 
+    logger.debug(f"Proxying {url}")
+
     with urllib.request.urlopen(url) as response:
         data = response.read()
         return data
@@ -38,7 +78,11 @@ def proxy():
 
 @app.route("/")
 def home():
-    return render_template("home.html")
+    return render_template(
+        "home.html",
+        wikimedia_projects=app.wikimedia_projects,
+        languages=app.languages,
+    )
 
 
 @app.route("/search", methods=["GET", "POST"])
@@ -50,35 +94,58 @@ def search():
         return redirect(
             url_for("search_results", project=project, lang=lang, query=query)
         )
-    return render_template("search.html")
+    return render_template(
+        "search.html",
+        wikimedia_projects=app.wikimedia_projects,
+        languages=app.languages,
+    )
 
 
 @app.route("/<project>/<lang>/wiki/<title>")
 def wiki_article(project, lang, title):
-    base_url = WIKIMEDIA_PROJECTS.get(project, "wikipedia.org")
-    url = f"https://{lang}.{base_url}/w/api.php?action=query&format=json&titles={escape(title.replace(' ', '_'), True)}&prop=revisions&rvprop=content&rvparse=1"
+    language_projects = app.languages.get(lang, {}).get("projects", {})
+    base_url = language_projects.get(project)
+
+    if not base_url:
+        return "Invalid language or project"
+
+    logger.debug(f"Fetching {title} from {base_url}")
+
+    url = f"{base_url}/w/api.php?action=query&format=json&titles={escape(title.replace(' ', '_'), True)}&prop=revisions&rvprop=content&rvparse=1"
     with urllib.request.urlopen(url) as response:
         data = json.loads(response.read().decode())
     pages = data["query"]["pages"]
     article_html = next(iter(pages.values()))["revisions"][0]["*"]
 
     soup = BeautifulSoup(article_html, "html.parser")
 
     for a in soup.find_all("a", href=True):
         href = a["href"]
 
         if href.startswith("/wiki/"):
             a["href"] = f"/{project}/{lang}{href}"
 
         elif href.startswith("//") or href.startswith("https://"):
-            parts = href.split("/")
-            if len(parts) > 4:
-                target_project = ".".join(parts[2].split(".")[1:])
-                target_lang = parts[2].split(".")[0]
-                target_title = "/".join(parts[4:])
-                if target_project in WIKIMEDIA_PROJECTS.values():
-                    target_project = list(WIKIMEDIA_PROJECTS.keys())[
-                        list(WIKIMEDIA_PROJECTS.values()).index(target_project)
-                    ]
-                    a["href"] = f"/{target_project}/{target_lang}/wiki/{target_title}"
+            parts = urlparse(href)
+
+            target_domain = parts.netloc
+            path_parts = parts.path.split("/")
+
+            if len(path_parts) > 4:
+                target_title = "/".join(path_parts[4:])
+                target_lang = target_domain.split(".")[0]
+
+                found = False
+                for language, language_projects in app.languages.items():
+                    for project_name, project_url in language_projects.items():
+                        if target_domain == project_url:
+                            a["href"] = (
+                                f"/{project_name}/{target_lang}/wiki/{target_title}"
+                            )
+                            found = True
+                            break
+                    if found:
+                        break
 
     for span in soup.find_all("span", class_="mw-editsection"):
         span.decompose()
@@ -89,19 +156,37 @@ def wiki_article(project, lang, title):
     for img in soup.find_all("img"):
         img["src"] = get_proxy_url(img["src"])
 
+    for source in soup.find_all("source"):
+        source["src"] = get_proxy_url(source["src"])
+
+    for video in soup.find_all("video"):
+        video["poster"] = get_proxy_url(video["poster"])
+
     for li in soup.find_all("li"):
         # If "nv-view", "nv-talk", "nv-edit" classes are on the li element, remove it
         if any(cls in li.get("class", []) for cls in ["nv-view", "nv-talk", "nv-edit"]):
             li.decompose()
 
     processed_html = str(soup)
-    return render_template("article.html", title=title, content=processed_html)
+    return render_template(
+        "article.html",
+        title=title,
+        content=processed_html,
+        wikimedia_projects=app.wikimedia_projects,
+        languages=app.languages,
+    )
 
 
 @app.route("/<project>/<lang>/search/<query>")
 def search_results(project, lang, query):
-    base_url = WIKIMEDIA_PROJECTS.get(project, "wikipedia.org")
-    url = f"https://{lang}.{base_url}/w/api.php?action=query&format=json&list=search&srsearch={query}"
+    language_projects = app.languages.get(lang, {}).get("projects", {})
+    base_url = language_projects.get(project)
+
+    if not base_url:
+        return "Invalid language or project"
+
+    logger.debug(f"Searching {base_url} for {query}")
+
+    url = f"{base_url}/w/api.php?action=query&format=json&list=search&srsearch={query}"
     with urllib.request.urlopen(url) as response:
         data = json.loads(response.read().decode())
     search_results = data["query"]["search"]
@@ -111,6 +196,8 @@ def search_results(project, lang, query):
         search_results=search_results,
         project=project,
         lang=lang,
+        wikimedia_projects=app.wikimedia_projects,
+        languages=app.languages,
     )
 
 
@@ -119,5 +206,9 @@ def search_redirect(project, lang, query):
     return redirect(url_for("search_results", project=project, lang=lang, query=query))
 
 
+app.wikimedia_projects, app.languages = get_wikimedia_projects()
+
+print(len(app.wikimedia_projects), len(app.languages))
+
 if __name__ == "__main__":
     app.run(debug=True)
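To illustrate the proxying path end to end, here is a small sketch (not part of the commit) of what get_proxy_url() produces and how the /proxy route can recover the original URL. The leading-"//" normalization is an assumption based on the 'url = "https:" + url' context line in the diff above:

from urllib.parse import urlencode, urlparse, parse_qs

def get_proxy_url(url):
    # Simplified copy of the helper above; the "//" branch is assumed
    # from the truncated start of the function.
    if url.startswith("//"):
        url = "https:" + url
    if not url.startswith("https://upload.wikimedia.org/"):
        return url
    return f"/proxy?{urlencode({'url': url})}"

proxied = get_proxy_url("//upload.wikimedia.org/wikipedia/commons/a/ab/X.jpg")
# proxied == "/proxy?url=https%3A%2F%2Fupload.wikimedia.org%2F..."
recovered = parse_qs(urlparse(proxied).query)["url"][0]
assert recovered == "https://upload.wikimedia.org/wikipedia/commons/a/ab/X.jpg"

The remaining hunks belong to the second changed file, a template, which wires the same two dicts into the search form's project and language dropdowns.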
@@ -87,14 +87,14 @@
     <h1><a href="/">Wikimore</a></h1>
     <form id="search-form" action="{{ url_for('search') }}" method="post">
       <select name="project" id="project">
-        <option value="wikipedia">Wikipedia</option>
-        <option value="wiktionary">Wiktionary</option>
-        <!-- TODO: Add more projects -->
+        {% for key, value in wikimedia_projects.items() %}
+        <option value="{{ key }}">{{ value }}</option>
+        {% endfor %}
       </select>
       <select name="lang" id="lang">
-        <option value="en">English</option>
-        <option value="de">German</option>
-        <!-- TODO: Add more languages -->
+        {% for key, value in languages.items() %}
+        <option value="{{ key }}">{{ value["name"] }}</option>
+        {% endfor %}
       </select>
       <input type="text" name="query" id="query" placeholder="Search Wikipedia" required>
       <button type="submit">Search</button>
@@ -104,7 +104,7 @@
     {% block content %}{% endblock %}
   </div>
   <div id="footer">
-    <p>Brought to you by <a href="https://git.private.coffee/privatecoffee/wikimore">Wikimore</a></p>
+    <p>Brought to you by <a href="https://git.private.coffee/privatecoffee/wikimore">Wikimore</a>, a <a href="https://private.coffee">Private.coffee</a> project</p>
   </div>
 </body>
 </html>
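As a quick sanity check on the new dropdown loops, this hypothetical snippet (not part of the commit) renders one language option from the languages dict the views now pass in:

from jinja2 import Template

# Renders the same loop the template uses for the language <select>.
options = Template(
    '{% for key, value in languages.items() %}'
    '<option value="{{ key }}">{{ value["name"] }}</option>'
    '{% endfor %}'
).render(languages={"en": {"name": "English", "projects": {}}})

print(options)  # -> <option value="en">English</option>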