diff --git a/app.py b/app.py
index 2ef47f8..1e129a6 100644
--- a/app.py
+++ b/app.py
@@ -1,17 +1,52 @@
 from flask import Flask, render_template, request, redirect, url_for
 import urllib.request
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urlparse
 from html import escape
 import json
+import logging
 from bs4 import BeautifulSoup
 
 app = Flask(__name__)
 
-WIKIMEDIA_PROJECTS = {
-    "wikipedia": "wikipedia.org",
-    "wiktionary": "wiktionary.org",
-    # TODO: Add more Wikimedia projects
-}
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger(__name__)
+
+handler = logging.StreamHandler()
+handler.setLevel(logging.DEBUG)
+logger.addHandler(handler)
+
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+handler.setFormatter(formatter)
+
+
+def get_wikimedia_projects():
+    url = "https://meta.wikimedia.org/w/api.php?action=sitematrix&format=json"
+    with urllib.request.urlopen(url) as response:
+        data = json.loads(response.read().decode())
+
+    projects = {}
+    languages = {}
+
+    for key, value in data["sitematrix"].items():
+        if key.isdigit():
+            language = value["name"]
+            language_code = value["code"]
+            language_projects = {}
+
+            for site in value["site"]:
+                language_projects[site["code"]] = site["url"]
+
+                if language_code == "en":
+                    projects[site["code"]] = site["sitename"]
+
+            if language_projects:
+                languages[language_code] = {
+                    "projects": language_projects,
+                    "name": language,
+                }
+
+    return projects, languages
 
 
 def get_proxy_url(url):
@@ -19,8 +54,10 @@ def get_proxy_url(url):
         url = "https:" + url
 
     if not url.startswith("https://upload.wikimedia.org/"):
+        logger.debug(f"Not generating proxy URL for {url}")
         return url
 
+    logger.debug(f"Generating proxy URL for {url}")
     return f"/proxy?{urlencode({'url': url})}"
 
 
@@ -29,8 +66,11 @@ def proxy():
     url = request.args.get("url")
 
     if not url or not url.startswith("https://upload.wikimedia.org/"):
+        logger.error(f"Invalid URL for proxying: {url}")
         return "Invalid URL"
 
+    logger.debug(f"Proxying {url}")
+
     with urllib.request.urlopen(url) as response:
         data = response.read()
         return data
@@ -38,7 +78,11 @@ def proxy():
 
 @app.route("/")
 def home():
-    return render_template("home.html")
+    return render_template(
+        "home.html",
+        wikimedia_projects=app.wikimedia_projects,
+        languages=app.languages,
+    )
 
 
 @app.route("/search", methods=["GET", "POST"])
@@ -50,35 +94,60 @@ def search():
         return redirect(
             url_for("search_results", project=project, lang=lang, query=query)
         )
-    return render_template("search.html")
+    return render_template(
+        "search.html",
+        wikimedia_projects=app.wikimedia_projects,
+        languages=app.languages,
+    )
 
 
 @app.route("/<project>/<lang>/wiki/<title>")
 def wiki_article(project, lang, title):
-    base_url = WIKIMEDIA_PROJECTS.get(project, "wikipedia.org")
-    url = f"https://{lang}.{base_url}/w/api.php?action=query&format=json&titles={escape(title.replace(" ", "_"), True)}&prop=revisions&rvprop=content&rvparse=1"
+    language_projects = app.languages.get(lang, {}).get("projects", {})
+    base_url = language_projects.get(project)
+
+    if not base_url:
+        return "Invalid language or project"
+
+    logger.debug(f"Fetching {title} from {base_url}")
+
+    url = f"{base_url}/w/api.php?action=query&format=json&titles={escape(title.replace(' ', '_'), True)}&prop=revisions&rvprop=content&rvparse=1"
     with urllib.request.urlopen(url) as response:
         data = json.loads(response.read().decode())
 
     pages = data["query"]["pages"]
     article_html = next(iter(pages.values()))["revisions"][0]["*"]
 
     soup = BeautifulSoup(article_html, "html.parser")
+
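+    # Rewrite links inside the article body so navigation stays on this
+    # instance instead of pointing back at the upstream Wikimedia domains.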
"html.parser") + for a in soup.find_all("a", href=True): href = a["href"] + if href.startswith("/wiki/"): a["href"] = f"/{project}/{lang}{href}" elif href.startswith("//") or href.startswith("https://"): - parts = href.split("/") - if len(parts) > 4: - target_project = ".".join(parts[2].split(".")[1:]) - target_lang = parts[2].split(".")[0] - target_title = "/".join(parts[4:]) - if target_project in WIKIMEDIA_PROJECTS.values(): - target_project = list(WIKIMEDIA_PROJECTS.keys())[ - list(WIKIMEDIA_PROJECTS.values()).index(target_project) - ] - a["href"] = f"/{target_project}/{target_lang}/wiki/{target_title}" + parts = urlparse(href) + + target_domain = parts.netloc + path_parts = parts.path.split("/") + + if len(path_parts) > 4: + target_title = "/".join(path_parts[4:]) + target_lang = target_domain.split(".")[0] + + found = False + for language, language_projects in app.languages.items(): + for project_name, project_url in language_projects.items(): + if target_domain == project_url: + a["href"] = ( + f"/{project_name}/{target_lang}/wiki/{target_title}" + ) + found = True + break + if found: + break for span in soup.find_all("span", class_="mw-editsection"): span.decompose() @@ -89,19 +156,37 @@ def wiki_article(project, lang, title): for img in soup.find_all("img"): img["src"] = get_proxy_url(img["src"]) + for source in soup.find_all("source"): + source["src"] = get_proxy_url(source["src"]) + + for video in soup.find_all("video"): + video["poster"] = get_proxy_url(video["poster"]) + for li in soup.find_all("li"): - # If "nv-view", "nv-talk", "nv-edit" classes are on the li element, remove it if any(cls in li.get("class", []) for cls in ["nv-view", "nv-talk", "nv-edit"]): li.decompose() processed_html = str(soup) - return render_template("article.html", title=title, content=processed_html) + return render_template( + "article.html", + title=title, + content=processed_html, + wikimedia_projects=app.wikimedia_projects, + languages=app.languages, + ) @app.route("/<project>/<lang>/search/<query>") def search_results(project, lang, query): - base_url = WIKIMEDIA_PROJECTS.get(project, "wikipedia.org") - url = f"https://{lang}.{base_url}/w/api.php?action=query&format=json&list=search&srsearch={query}" + language_projects = app.languages.get(lang, {}).get("projects", {}) + base_url = language_projects.get(project) + + if not base_url: + return "Invalid language or project" + + logger.debug(f"Searching {base_url} for {query}") + + url = f"{base_url}/w/api.php?action=query&format=json&list=search&srsearch={query}" with urllib.request.urlopen(url) as response: data = json.loads(response.read().decode()) search_results = data["query"]["search"] @@ -111,6 +196,8 @@ def search_results(project, lang, query): search_results=search_results, project=project, lang=lang, + wikimedia_projects=app.wikimedia_projects, + languages=app.languages, ) @@ -119,5 +206,9 @@ def search_redirect(project, lang, query): return redirect(url_for("search_results", project=project, lang=lang, query=query)) +app.wikimedia_projects, app.languages = get_wikimedia_projects() + +print(len(app.wikimedia_projects), len(app.languages)) + if __name__ == "__main__": app.run(debug=True) diff --git a/templates/base.html b/templates/base.html index 268f451..391710c 100644 --- a/templates/base.html +++ b/templates/base.html @@ -87,14 +87,14 @@ <h1><a href="/">Wikimore</a></h1> <form id="search-form" action="{{ url_for('search') }}" method="post"> <select name="project" id="project"> - <option value="wikipedia">Wikipedia</option> - <option 
value="wiktionary">Wiktionary</option> - <!-- TODO: Add more projects --> + {% for key, value in wikimedia_projects.items() %} + <option value="{{ key }}">{{ value }}</option> + {% endfor %} </select> <select name="lang" id="lang"> - <option value="en">English</option> - <option value="de">German</option> - <!-- TODO: Add more languages --> + {% for key, value in languages.items() %} + <option value="{{ key }}">{{ value["name"] }}</option> + {% endfor %} </select> <input type="text" name="query" id="query" placeholder="Search Wikipedia" required> <button type="submit">Search</button> @@ -104,7 +104,7 @@ {% block content %}{% endblock %} </div> <div id="footer"> - <p>Brought to you by <a href="https://git.private.coffee/privatecoffee/wikimore">Wikimore</a></p> + <p>Brought to you by <a href="https://git.private.coffee/privatecoffee/wikimore">Wikimore</a>, a <a href="https://private.coffee">Private.coffee</a> project</p> </div> </body> </html> \ No newline at end of file