From 29592de90a4efa1e9c8e80902d46ab698845a20a Mon Sep 17 00:00:00 2001 From: Kumi Date: Tue, 16 Jan 2024 17:13:59 +0100 Subject: [PATCH] Refactor data fetching logic using stdlib Removed dependencies on external libraries such as `requests`, `requests-html`, and `playwright` in favor of Python's standard libraries like `urllib` for HTTP requests and parsing functionality. A more robust and efficient data update function was introduced to scrape API keys and fetch project data using Typesense. Transitioned from a manual browser-based scraping approach to an API-based one for improved stability and performance. Added logging for better monitoring and debuggability. Error-handling now leverages `HTTPError` from `urllib.error`. Shifted the configuration of debug mode to rely on an environment variable, aligning with Twelve-Factor principles. Removed unused functions and streamlined the handling of various routes within the Flask app. This change shifts the project towards a more maintainable code base by using built-in libraries, reduces external dependencies, and improves resilience and scalability of the web scraping components. --- main.py | 371 +++++++++++++++++++++++++---------------------- requirements.txt | 3 - 2 files changed, 201 insertions(+), 173 deletions(-) diff --git a/main.py b/main.py index fbb67e5..492492b 100644 --- a/main.py +++ b/main.py @@ -9,90 +9,121 @@ from flask import ( stream_with_context, ) -import requests -import re - -from bs4 import BeautifulSoup from urllib.parse import quote, unquote +from urllib.request import Request, urlopen +from urllib.error import HTTPError from traceback import print_exc -from requests_html import HTMLSession -from playwright.sync_api import sync_playwright -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse from argparse import ArgumentParser +from configparser import ConfigParser from werkzeug.exceptions import BadRequest, abort, InternalServerError, NotFound +from bs4 import BeautifulSoup import os +import json +import re +import logging + +logging.basicConfig(level=logging.DEBUG) global_ibles = {} -def proxy(src): - return "/proxy/?url=" + quote(str(src)) + +def proxy(url): + logging.debug(f"Generating proxy URL for {url}") + return f"/proxy/?url={url}" + + +def get_typesense_api_key(): + logging.debug("Getting Typesense API key...") + + data = urlopen("https://www.instructables.com/") + soup = BeautifulSoup(data.read().decode(), "html.parser") + scripts = soup.select("script") + + for script in scripts: + if "typesense" in script.text and ( + matches := re.search(r'"typesenseApiKey":\s?"(.*?)"', script.text) + ): + api_key = matches.group(1) + logging.debug(f"Identified Typesense API key as {api_key}") + return api_key + + logging.error("Failed to get Typesense API key") + + +TYPESENSE_API_KEY = get_typesense_api_key() + + +def projects_search( + query="*", + category="", + channel="", + filter_by="featureFlag:=true", + page=1, + per_page=50, +): + if category: + if filter_by: + filter_by += " && " + filter_by += f"category:={category}" + + if channel: + if filter_by: + filter_by += " && " + filter_by += f"channel:={channel}" + + query = quote(query) + filter_by = quote(filter_by) + + logging.debug(f"Searching projects with query {query} and filter {filter_by}") + + projects_headers = {"x-typesense-api-key": TYPESENSE_API_KEY} + projects_request = Request( + 
f"https://www.instructables.com/api_proxy/search/collections/projects/documents/search?q={query}&query_by=title,stepBody,screenName&page={page}&sort_by=publishDate:desc&include_fields=title,urlString,coverImageUrl,screenName,favorites,views,primaryClassification,featureFlag,prizeLevel,IMadeItCount&filter_by={filter_by}&per_page={per_page}", + headers=projects_headers, + ) + + projects_data = urlopen(projects_request) + project_obj = json.loads(projects_data.read().decode()) + project_ibles = project_obj["hits"] + + logging.debug(f"Got {len(project_ibles)} projects") + + return project_ibles + def update_data(): - playwright = sync_playwright().start() - browser = playwright.chromium.launch(headless=True) - page = browser.new_page() + logging.debug("Updating data...") channels = [] - data = requests.get(f"https://www.instructables.com/sitemap/") - - soup = BeautifulSoup(data.text, "html.parser") - - main = soup.select("div.sitemap-content")[0] + sitemap_data = urlopen("https://www.instructables.com/sitemap/") + sitemap_soup = BeautifulSoup(sitemap_data.read().decode(), "html.parser") + main = sitemap_soup.select("div.sitemap-content")[0] groups = [] for group in main.select("div.group-section"): channels.append(group.select("h2 a")[0].text.lower()) global_ibles["/projects"] = [] - - page.goto("https://www.instructables.com/projects") + project_ibles = projects_search() while len(global_ibles["/projects"]) <= 0: - for ible in page.query_selector_all(".ibleCard__QPJVm"): - link = ( - ible.query_selector("a") - .get_attribute("href") - .replace("https://www.instructables.com", "{instance_root_url}") - ) - img = proxy(ible.query_selector("img").get_attribute("src")) + for ible in project_ibles: + link = f"/{ible['document']['urlString']}" + img = proxy(ible["document"]["coverImageUrl"]) - title = ible.query_selector(".title__t0fGQ").inner_text() - author = ible.query_selector("a[href^='/member/']").inner_text() - author_link = ( - ible.query_selector("a[href^='/member/']") - .get_attribute("href") - .replace("https://www.instructables.com", "{instance_root_url}") - ) + title = ible["document"]["title"] + author = ible["document"]["screenName"] + author_link = f"/member/{author}" - for c in channels: - try: - channel = ible.query_selector("a[href^='/" + c + "']").inner_text() - channel_link = ( - ible.query_selector("a[href^='/" + c + "']") - .get_attribute("href") - .replace("https://www.instructables.com", "{instance_root_url}") - ) - except: - try: - channel = ible.query_selector("a[href^='/projects/']").inner_text() - channel_link = ( - ible.query_selector("a[href^='/projects/']") - .get_attribute("href") - .replace("https://www.instructables.com", "{instance_root_url}") - ) - except: - pass + channel = ible["document"]["primaryClassification"] + channel_link = f"/channel/{channel}" - stats = ible.query_selector(".stats__GFKyl") - views = 0 - if stats.query_selector("div[title$=' views']"): - views = stats.query_selector("div[title$=' views']").inner_text() - favorites = 0 - if stats.query_selector("div[title$=' favorites']"): - favorites = stats.query_selector("div[title$=' favorites']").inner_text() + views = ible["document"]["views"] + favorites = ible["document"]["favorites"] global_ibles["/projects"].append( [ @@ -108,10 +139,8 @@ def update_data(): ] ) - browser.close() - playwright.stop() -debugmode = False +debugmode = os.environ.get("FLASK_DEBUG", False) if __name__ == "__main__": parser = ArgumentParser() @@ -135,7 +164,7 @@ if __name__ == "__main__": help="Host to listen on", ) 
args = parser.parse_args() - + if args.debug: debugmode = True @@ -147,14 +176,16 @@ print("Started!") app = Flask(__name__, template_folder="templates", static_folder="static") -def get_instance_root_url(request): - return request.url_root +if debugmode: + app.logger.setLevel(logging.DEBUG) + @app.route("/cron/") def cron(): update_data() return "OK" + def explore_lists(soup): list_ = [] for ible in soup.select(".home-content-explore-ible"): @@ -199,8 +230,6 @@ def member_header(header): profile_top = header.select("div.profile-top")[0] - print(header.encode_contents()) - # stats_text = profile_top.select("div.profile-header-stats")[0] # stats_num = header.select("div.profile-top div.profile-header-stats")[1] @@ -260,11 +289,12 @@ def member_header(header): def category_page(path, name, teachers=False): - data = requests.get("https://www.instructables.com" + path) - if data.status_code != 200: - abort(data.status_code) + try: + data = urlopen("https://www.instructables.com" + path) + except HTTPError as e: + abort(e.code) - soup = BeautifulSoup(data.text, "html.parser") + soup = BeautifulSoup(data.read().decode(), "html.parser") channels = [] for card in soup.select("div.scrollable-cards-inner div.scrollable-card"): @@ -328,70 +358,39 @@ def category_page(path, name, teachers=False): def project_list(path, head, sort=""): - playwright = sync_playwright().start() - browser = playwright.chromium.launch(headless=True) - page = browser.new_page() - - page.goto(urljoin("https://www.instructables.com", path)) - head = f"{head + ' ' if head != '' else ''}Projects" + sort - path_ = path.rsplit("/", 1)[0] + path = urlparse(path).path - if path == "/projects/" or path == "/projects": + if path in ("/projects/", "/projects"): ibles = global_ibles["/projects"] else: + if not "projects" in path.split("/"): + abort(404) + ibles = [] - for ible in page.query_selector_all(".ibleCard__QPJVm"): - link = ( - ible.query_selector("a") - .get_attribute("href") - .replace("https://www.instructables.com", "{instance_root_url}") - ) - img = proxy( - ible.find_elements(By.CSS_SELECTOR, "img")[0].get_attribute("src") - ) + parts = path.split("/") - title = ible.find_elements(By.CLASS_NAME, "title__t0fGQ")[0].text - author = ible.find_elements(By.CSS_SELECTOR, "a[href^='/member/']")[0].text - author_link = ( - ible.find_elements(By.CSS_SELECTOR, "a[href^='/member/']")[0] - .get_attribute("href") - .replace("https://www.instructables.com", "{instance_root_url}") - ) + category = parts[1] + channel = "" if parts[2] == "projects" else parts[2] - channel = "TEST" - channel_link = "TEST" + # TODO: Add pagination, popular, etc. 
-            for c in channels:
-                try:
-                    channel = ible.query_selector("a[href^='/" + c + "']").inner_text()
-                    channel_link = (
-                        ible.query_selector("a[href^='/" + c + "']")
-                        .get_attribute("href")
-                        .replace("https://www.instructables.com", "{instance_root_url}")
-                    )
-                except:
-                    try:
-                        channel = ible.query_selector("a[href^='/projects/'] span").inner_text()
-                        channel_link = (
-                            ible.query_selector("a[href^='/projects/']")
-                            .get_attribute("href")
-                            .replace("https://www.instructables.com", "{instance_root_url}")
-                        )
-                    except:
-                        pass
+        project_ibles = projects_search(category=category, channel=channel)

-            stats = ible.query_selector(".stats__GFKyl")
-            views = 0
+        for ible in project_ibles:
+            link = f"/{ible['document']['urlString']}"
+            img = proxy(ible["document"]["coverImageUrl"])

-            if stats.query_selector("div[title$=' views']"):
-                views = stats.query_selector("div[title$=' views']").inner_text()
+            title = ible["document"]["title"]
+            author = ible["document"]["screenName"]
+            author_link = f"/member/{author}"

-            favorites = 0
+            channel = ible["document"]["primaryClassification"]
+            channel_link = f"/channel/{channel}"

-            if stats.query_selector("div[title$=' favorites']"):
-                favorites = stats.query_selector("div[title$=' favorites']").inner_text()
+            views = ible["document"]["views"]
+            favorites = ible["document"]["favorites"]

             ibles.append(
                 [
@@ -410,31 +409,43 @@
             if len(ibles) >= 8:
                 break

-    browser.close()
-    playwright.stop()
+    return render_template("projects.html", data=[head, ibles, path])

-    return render_template("projects.html", data=[head, ibles, path_])


 @app.route("/sitemap/")
-def route_sitemap():
-    data = requests.get(f"https://www.instructables.com/sitemap/")
-    if data.status_code != 200:
-        abort(data.status_code)
+@app.route("/sitemap/<path:path>")
+def route_sitemap(path=""):
+    try:
+        data = urlopen("https://www.instructables.com/sitemap/" + path)
+    except HTTPError as e:
+        abort(e.code)

-    soup = BeautifulSoup(data.text, "html.parser")
+    soup = BeautifulSoup(data.read().decode(), "html.parser")

     main = soup.select("div.sitemap-content")[0]

-    groups = []
-    for group in main.select("div.group-section"):
-        category = group.select("h2 a")[0].text
-        category_link = group.select("h2 a")[0].get("href")
+    group_section = main.select("div.group-section")
+
+    if group_section:
+        groups = []
+        for group in group_section:
+            category = group.select("h2 a")[0].text
+            category_link = group.select("h2 a")[0].get("href")
+            channels = []
+            for li in group.select("ul.sitemap-listing li"):
+                channel = li.a.text
+                channel_link = li.a["href"]
+                channels.append([channel, channel_link])
+            groups.append([category, category_link, channels])
+
+    else:
+        groups = []
         channels = []
-        for li in group.select("ul.sitemap-listing li"):
+        for li in main.select("ul.sitemap-listing li"):
             channel = li.a.text
             channel_link = li.a["href"]
             channels.append([channel, channel_link])
-        groups.append([category, category_link, channels])
+        groups.append(["", "", channels])

     return render_template("sitemap.html", data=groups)

@@ -444,11 +455,13 @@ def route_contest_archive():
     page = 1
     if request.args.get("page") != None:
         page = request.args.get("page")
-    data = requests.get(f"https://www.instructables.com/contest/archive/?page={page}")
-    if data.status_code != 200:
-        abort(data.status_code)

-    soup = BeautifulSoup(data.text, "html.parser")
+    try:
+        data = urlopen(f"https://www.instructables.com/contest/archive/?page={page}")
+    except HTTPError as e:
+        abort(e.code)
+
+    soup = BeautifulSoup(data.read().decode(), "html.parser")

     main = soup.select("div#contest-archive-wrapper")[0]

@@ -481,11 +494,12 @@

 @app.route("/contest/<contest>/")
 def route_contest(contest):
-    data = requests.get(f"https://www.instructables.com/contest/{contest}/")
-    if data.status_code != 200:
-        abort(data.status_code)
+    try:
+        data = urlopen(f"https://www.instructables.com/contest/{contest}/")
+    except HTTPError as e:
+        abort(e.code)

-    soup = BeautifulSoup(data.text, "html.parser")
+    soup = BeautifulSoup(data.read().decode(), "html.parser")

     title = soup.select('meta[property="og:title"]')[0].get("content")

@@ -500,7 +514,7 @@
     info.select("div#site-announcements-page")[0].decompose()
     info.select("h3")[0].decompose()
     info.select("div#contest-body-nav")[0].decompose()
-    info = str(info).replace("https://www.instructables.com", "{instance_root_url}")
+    info = str(info).replace("https://www.instructables.com", "/")

     entries = body.select("span.contest-entity-count")[0].text

@@ -535,11 +549,12 @@

 @app.route("/contest/")
 def route_contests():
-    data = requests.get("https://www.instructables.com/contest/")
-    if data.status_code != 200:
-        abort(data.status_code)
+    try:
+        data = urlopen("https://www.instructables.com/contest/")
+    except HTTPError as e:
+        abort(e.code)

-    soup = BeautifulSoup(data.text, "html.parser")
+    soup = BeautifulSoup(data.read().decode(), "html.parser")

     contest_count = str(soup.select("p.contest-count")[0])

@@ -660,11 +675,12 @@ def route_sitemap_circuits(category, subcategory):

 @app.route("/member/<member>/instructables/")
 def route_member_instructables(member):
-    data = requests.get(f"https://www.instructables.com/member/{member}/instructables")
-    if data.status_code != 200:
-        abort(data.status_code)
+    try:
+        data = urlopen(f"https://www.instructables.com/member/{member}/instructables/")
+    except HTTPError as e:
+        abort(e.code)

-    soup = BeautifulSoup(data.text, "html.parser")
+    soup = BeautifulSoup(data.read().decode(), "html.parser")

     header = soup.select(".profile-header.profile-header-social")[0]
     header_content = member_header(header)

@@ -697,13 +713,16 @@ def route_member(member):
         "User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0"
     }

-    data = requests.get(
+    request = Request(
         f"https://www.instructables.com/member/{member}/", headers=headers
     )
-    if data.status_code != 200:
-        abort(data.status_code)

-    soup = BeautifulSoup(data.text, "html.parser")
+    try:
+        data = urlopen(request)
+    except HTTPError as e:
+        abort(e.code)
+
+    soup = BeautifulSoup(data.read().decode(), "html.parser")

     header_content = member_header(soup)

@@ -753,11 +772,12 @@

 @app.route("/
<article>/")
 def route_article(article):
-    data = requests.get(f"https://www.instructables.com/{article}/")
-    if data.status_code != 200:
-        abort(data.status_code)
+    try:
+        data = urlopen(f"https://www.instructables.com/{article}/")
+    except HTTPError as e:
+        abort(e.code)

-    soup = BeautifulSoup(data.text, "html.parser")
+    soup = BeautifulSoup(data.read().decode(), "html.parser")

     try:
         header = soup.select("header")

@@ -800,9 +820,7 @@
             step_text = str(step.select("div.step-body")[0])
             step_text = step_text.replace(
                 "https://content.instructables.com",
-                "{instance_root_url}/proxy/?url=https://content.instructables.com".format(
-                    instance_root_url=get_instance_root_url(request)
-                ),
+                "/proxy/?url=https://content.instructables.com",
             )
             steps.append([step_title, step_imgs, step_text, step_videos])

@@ -941,6 +959,7 @@

 @app.route("/<category>/<channel>/")
 def route_channel_redirect(category, channel):
+    # TODO: Just check if the channel exists
     if (
         category == "circuits"
         or category == "workshop"
@@ -957,11 +976,12 @@

 @app.route("/")
 def route_explore():
-    data = requests.get("https://www.instructables.com/")
-    if data.status_code != 200:
-        abort(data.status_code)
+    try:
+        data = urlopen("https://www.instructables.com/")
+    except HTTPError as e:
+        abort(e.code)

-    soup = BeautifulSoup(data.text, "html.parser")
+    soup = BeautifulSoup(data.read().decode(), "html.parser")

     explore = soup.select(".home-content-explore-wrap")[0]

@@ -994,32 +1014,43 @@
     if url.startswith("https://cdn.instructables.com/") or url.startswith(
         "https://content.instructables.com/"
     ):
-        data = requests.get(unquote(url))
-        return Response(data.content, content_type=data.headers["content-type"])
+        try:
+            data = urlopen(unquote(url))
+        except HTTPError as e:
+            abort(e.code)
+
+        return Response(data.read(), content_type=data.headers["content-type"])
     else:
         raise BadRequest()
     else:
         raise BadRequest()

+
 @app.route("/privacypolicy/")
 def privacypolicy():
+    # TODO: Make this dynamic
     return render_template("privacypolicy.html")

+
 @app.errorhandler(404)
 def not_found(e):
     return render_template("404.html")

+
 @app.errorhandler(400)
 def bad_request(e):
     return render_template("400.html")

+
 @app.errorhandler(429)
 def too_many_requests(e):
     return render_template("429.html")

+
 @app.errorhandler(500)
 def internal_server_error(e):
     return render_template("500.html")

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     app.run(port=args.port, host=args.listen_host, debug=debugmode)
diff --git a/requirements.txt b/requirements.txt
index 72ac828..29a1f88 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,2 @@
 bs4
-requests
 flask
-requests-html
-playwright
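The sketch below is not part of the patch; it is a minimal stdlib-only script showing the same Typesense flow the diff introduces, for trying the endpoint without running the Flask app. The "typesenseApiKey" regex and the api_proxy search URL are taken from main.py above; the reduced parameter set, the plain-regex key extraction (no BeautifulSoup), and the printed fields are assumptions made for illustration.

# Standalone sketch of the stdlib-only Typesense search flow (assumptions noted above).
import json
import re
from urllib.parse import quote
from urllib.request import Request, urlopen


def get_typesense_api_key():
    # The key sits in an inline script on the homepage; the regex mirrors main.py.
    html = urlopen("https://www.instructables.com/").read().decode()
    match = re.search(r'"typesenseApiKey":\s?"(.*?)"', html)
    return match.group(1) if match else None


def search_projects(api_key, query="*", page=1, per_page=10):
    # Trimmed-down version of projects_search(): no category/channel filters.
    url = (
        "https://www.instructables.com/api_proxy/search"
        "/collections/projects/documents/search"
        f"?q={quote(query)}&query_by=title,stepBody,screenName"
        f"&page={page}&per_page={per_page}"
        f"&filter_by={quote('featureFlag:=true')}"
    )
    request = Request(url, headers={"x-typesense-api-key": api_key})
    return json.loads(urlopen(request).read().decode())["hits"]


if __name__ == "__main__":
    key = get_typesense_api_key()
    for hit in search_projects(key, query="arduino"):
        doc = hit["document"]
        print(doc["views"], doc["title"], "->", f"/{doc['urlString']}")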