Refactor data fetching logic using stdlib

Removed the dependencies on the external libraries `requests`, `requests-html`, and `playwright` in favor of Python's standard library, using `urllib` for HTTP requests and URL parsing. Introduced a more robust and efficient data update function that scrapes the Typesense API key from the Instructables site and fetches project data through the Typesense search API. Transitioned from a manual browser-based scraping approach to an API-based one for improved stability and performance. Added logging for better monitoring and debuggability. Error handling now relies on `HTTPError` from `urllib.error`. Debug mode is now configured through an environment variable, in line with Twelve-Factor principles. Removed unused functions and streamlined the handling of various routes within the Flask app. Overall, this change makes the codebase more maintainable by using built-in libraries, reduces external dependencies, and improves the resilience and scalability of the web scraping components.
This commit is contained in:
Kumi 2024-01-16 17:13:59 +01:00
parent d269a9992e
commit 29592de90a
Signed by: kumi
GPG key ID: ECBCC9082395383F
2 changed files with 201 additions and 173 deletions

371
main.py
View file

@ -9,90 +9,121 @@ from flask import (
stream_with_context, stream_with_context,
) )
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import quote, unquote from urllib.parse import quote, unquote
from urllib.request import Request, urlopen
from urllib.error import HTTPError
from traceback import print_exc from traceback import print_exc
from requests_html import HTMLSession from urllib.parse import urljoin, urlparse
from playwright.sync_api import sync_playwright
from urllib.parse import urljoin
from argparse import ArgumentParser from argparse import ArgumentParser
from configparser import ConfigParser
from werkzeug.exceptions import BadRequest, abort, InternalServerError, NotFound from werkzeug.exceptions import BadRequest, abort, InternalServerError, NotFound
from bs4 import BeautifulSoup
import os import os
import json
import re
import logging
logging.basicConfig(level=logging.DEBUG)
global_ibles = {} global_ibles = {}
def proxy(url):
    """Return the local ``/proxy/`` URL that serves the remote asset *url*.

    The remote URL is percent-encoded before being embedded as the ``url``
    query parameter. Without that, any ``?``, ``&`` or ``#`` inside the
    remote URL would be parsed as part of the proxy URL itself and the
    proxied address would be truncated.

    Args:
        url: Address of the remote asset; converted with ``str()`` first.

    Returns:
        A site-relative URL of the form ``/proxy/?url=<encoded url>``.
    """
    logging.debug(f"Generating proxy URL for {url}")
    return "/proxy/?url=" + quote(str(url))
def get_typesense_api_key():
    """Scrape the Typesense API key from the Instructables home page.

    Downloads https://www.instructables.com/, looks at every ``<script>``
    element that mentions "typesense" and returns the value of the first
    ``"typesenseApiKey"`` field found.

    Returns:
        The API key as a string, or ``None`` if no script contains the key
        (an error is logged in that case).

    Errors raised by ``urlopen`` are not caught here and propagate to the
    caller.
    """
    logging.debug("Getting Typesense API key...")

    # Read the page inside a context manager so the HTTP response is closed
    # as soon as the body has been consumed.
    with urlopen("https://www.instructables.com/") as response:
        html = response.read().decode()

    soup = BeautifulSoup(html, "html.parser")

    for script in soup.select("script"):
        if "typesense" not in script.text:
            continue

        matches = re.search(r'"typesenseApiKey":\s?"(.*?)"', script.text)
        if matches:
            api_key = matches.group(1)
            logging.debug(f"Identified Typesense API key as {api_key}")
            return api_key

    logging.error("Failed to get Typesense API key")
    return None
# Resolved once, at import time, by fetching the Instructables home page.
# The value is None when the key could not be found in the page; it is sent
# as the "x-typesense-api-key" header by projects_search().
TYPESENSE_API_KEY = get_typesense_api_key()
def projects_search(
    query="*",
    category="",
    channel="",
    filter_by="featureFlag:=true",
    page=1,
    per_page=50,
):
    """Search Instructables projects through the site's Typesense proxy.

    Args:
        query: Search expression; ``"*"`` matches everything.
        category: When non-empty, adds a ``category:=<category>`` filter.
        channel: When non-empty, adds a ``channel:=<channel>`` filter.
        filter_by: Base filter expression; the category/channel filters are
            appended to it with ``" && "``.
        page: Result page to request.
        per_page: Number of results per page.

    Returns:
        The ``"hits"`` list from the decoded JSON response.

    Errors raised by ``urlopen`` and a missing ``"hits"`` key are not caught
    here and propagate to the caller.
    """
    # Collect the non-empty filter clauses and join them once.
    filters = [filter_by] if filter_by else []
    if category:
        filters.append(f"category:={category}")
    if channel:
        filters.append(f"channel:={channel}")

    # Everything interpolated into the URL is percent-encoded so a value can
    # never introduce extra query parameters.
    query = quote(query)
    filter_by = quote(" && ".join(filters))
    page = quote(str(page))
    per_page = quote(str(per_page))

    logging.debug(f"Searching projects with query {query} and filter {filter_by}")

    projects_headers = {"x-typesense-api-key": TYPESENSE_API_KEY}

    projects_request = Request(
        f"https://www.instructables.com/api_proxy/search/collections/projects/documents/search?q={query}&query_by=title,stepBody,screenName&page={page}&sort_by=publishDate:desc&include_fields=title,urlString,coverImageUrl,screenName,favorites,views,primaryClassification,featureFlag,prizeLevel,IMadeItCount&filter_by={filter_by}&per_page={per_page}",
        headers=projects_headers,
    )

    # The context manager closes the HTTP response once the body is read.
    with urlopen(projects_request) as projects_data:
        project_obj = json.loads(projects_data.read().decode())

    project_ibles = project_obj["hits"]

    logging.debug(f"Got {len(project_ibles)} projects")

    return project_ibles
def update_data(): def update_data():
playwright = sync_playwright().start() logging.debug("Updating data...")
browser = playwright.chromium.launch(headless=True)
page = browser.new_page()
channels = [] channels = []
data = requests.get(f"https://www.instructables.com/sitemap/") sitemap_data = urlopen("https://www.instructables.com/sitemap/")
sitemap_soup = BeautifulSoup(sitemap_data.read().decode(), "html.parser")
soup = BeautifulSoup(data.text, "html.parser") main = sitemap_soup.select("div.sitemap-content")[0]
main = soup.select("div.sitemap-content")[0]
groups = [] groups = []
for group in main.select("div.group-section"): for group in main.select("div.group-section"):
channels.append(group.select("h2 a")[0].text.lower()) channels.append(group.select("h2 a")[0].text.lower())
global_ibles["/projects"] = [] global_ibles["/projects"] = []
project_ibles = projects_search()
page.goto("https://www.instructables.com/projects")
while len(global_ibles["/projects"]) <= 0: while len(global_ibles["/projects"]) <= 0:
for ible in page.query_selector_all(".ibleCard__QPJVm"): for ible in project_ibles:
link = ( link = f"/{ible['document']['urlString']}"
ible.query_selector("a") img = proxy(ible["document"]["coverImageUrl"])
.get_attribute("href")
.replace("https://www.instructables.com", "{instance_root_url}")
)
img = proxy(ible.query_selector("img").get_attribute("src"))
title = ible.query_selector(".title__t0fGQ").inner_text() title = ible["document"]["title"]
author = ible.query_selector("a[href^='/member/']").inner_text() author = ible["document"]["screenName"]
author_link = ( author_link = f"/member/{author}"
ible.query_selector("a[href^='/member/']")
.get_attribute("href")
.replace("https://www.instructables.com", "{instance_root_url}")
)
for c in channels: channel = ible["document"]["primaryClassification"]
try: channel_link = f"/channel/{channel}"
channel = ible.query_selector("a[href^='/" + c + "']").inner_text()
channel_link = (
ible.query_selector("a[href^='/" + c + "']")
.get_attribute("href")
.replace("https://www.instructables.com", "{instance_root_url}")
)
except:
try:
channel = ible.query_selector("a[href^='/projects/']").inner_text()
channel_link = (
ible.query_selector("a[href^='/projects/']")
.get_attribute("href")
.replace("https://www.instructables.com", "{instance_root_url}")
)
except:
pass
stats = ible.query_selector(".stats__GFKyl") views = ible["document"]["views"]
views = 0 favorites = ible["document"]["favorites"]
if stats.query_selector("div[title$=' views']"):
views = stats.query_selector("div[title$=' views']").inner_text()
favorites = 0
if stats.query_selector("div[title$=' favorites']"):
favorites = stats.query_selector("div[title$=' favorites']").inner_text()
global_ibles["/projects"].append( global_ibles["/projects"].append(
[ [
@ -108,10 +139,8 @@ def update_data():
] ]
) )
browser.close()
playwright.stop()
# Debug mode is switched on through the FLASK_DEBUG environment variable.
# The raw value is a string, so it is parsed explicitly: an unset or empty
# variable and the values "0", "false" and "no" (case-insensitive) leave debug
# mode off; any other value turns it on. Using the raw string directly would
# treat FLASK_DEBUG=0 as enabled, because every non-empty string is truthy.
debugmode = os.environ.get("FLASK_DEBUG", "").strip().lower() not in (
    "",
    "0",
    "false",
    "no",
)
if __name__ == "__main__": if __name__ == "__main__":
parser = ArgumentParser() parser = ArgumentParser()
@ -135,7 +164,7 @@ if __name__ == "__main__":
help="Host to listen on", help="Host to listen on",
) )
args = parser.parse_args() args = parser.parse_args()
if args.debug: if args.debug:
debugmode = True debugmode = True
@ -147,14 +176,16 @@ print("Started!")
app = Flask(__name__, template_folder="templates", static_folder="static") app = Flask(__name__, template_folder="templates", static_folder="static")
def get_instance_root_url(request): if debugmode:
return request.url_root app.logger.setLevel(logging.DEBUG)
@app.route("/cron/") @app.route("/cron/")
def cron(): def cron():
update_data() update_data()
return "OK" return "OK"
def explore_lists(soup): def explore_lists(soup):
list_ = [] list_ = []
for ible in soup.select(".home-content-explore-ible"): for ible in soup.select(".home-content-explore-ible"):
@ -199,8 +230,6 @@ def member_header(header):
profile_top = header.select("div.profile-top")[0] profile_top = header.select("div.profile-top")[0]
print(header.encode_contents())
# stats_text = profile_top.select("div.profile-header-stats")[0] # stats_text = profile_top.select("div.profile-header-stats")[0]
# stats_num = header.select("div.profile-top div.profile-header-stats")[1] # stats_num = header.select("div.profile-top div.profile-header-stats")[1]
@ -260,11 +289,12 @@ def member_header(header):
def category_page(path, name, teachers=False): def category_page(path, name, teachers=False):
data = requests.get("https://www.instructables.com" + path) try:
if data.status_code != 200: data = urlopen("https://www.instructables.com" + path)
abort(data.status_code) except HTTPError as e:
abort(e.code)
soup = BeautifulSoup(data.text, "html.parser") soup = BeautifulSoup(data.read().decode(), "html.parser")
channels = [] channels = []
for card in soup.select("div.scrollable-cards-inner div.scrollable-card"): for card in soup.select("div.scrollable-cards-inner div.scrollable-card"):
@ -328,70 +358,39 @@ def category_page(path, name, teachers=False):
def project_list(path, head, sort=""): def project_list(path, head, sort=""):
playwright = sync_playwright().start()
browser = playwright.chromium.launch(headless=True)
page = browser.new_page()
page.goto(urljoin("https://www.instructables.com", path))
head = f"{head + ' ' if head != '' else ''}Projects" + sort head = f"{head + ' ' if head != '' else ''}Projects" + sort
path_ = path.rsplit("/", 1)[0] path = urlparse(path).path
if path == "/projects/" or path == "/projects": if path in ("/projects/", "/projects"):
ibles = global_ibles["/projects"] ibles = global_ibles["/projects"]
else: else:
if not "projects" in path.split("/"):
abort(404)
ibles = [] ibles = []
for ible in page.query_selector_all(".ibleCard__QPJVm"): parts = path.split("/")
link = (
ible.query_selector("a")
.get_attribute("href")
.replace("https://www.instructables.com", "{instance_root_url}")
)
img = proxy(
ible.find_elements(By.CSS_SELECTOR, "img")[0].get_attribute("src")
)
title = ible.find_elements(By.CLASS_NAME, "title__t0fGQ")[0].text category = parts[1]
author = ible.find_elements(By.CSS_SELECTOR, "a[href^='/member/']")[0].text channel = "" if parts[2] == "projects" else parts[2]
author_link = (
ible.find_elements(By.CSS_SELECTOR, "a[href^='/member/']")[0]
.get_attribute("href")
.replace("https://www.instructables.com", "{instance_root_url}")
)
channel = "TEST" # TODO: Add pagination, popular, etc.
channel_link = "TEST"
for c in channels: project_ibles = projects_search(category=category, channel=channel)
try:
channel = ible.query_selector("a[href^='/" + c + "']").inner_text()
channel_link = (
ible.query_selector("a[href^='/" + c + "']")
.get_attribute("href")
.replace("https://www.instructables.com", "{instance_root_url}")
)
except:
try:
channel = ible.query_selector("a[href^='/projects/'] span").inner_text()
channel_link = (
ible.query_selector("a[href^='/projects/']")
.get_attribute("href")
.replace("https://www.instructables.com", "{instance_root_url}")
)
except:
pass
stats = ible.query_selector(".stats__GFKyl") for ible in project_ibles:
views = 0 link = f"/{ible['document']['urlString']}"
img = proxy(ible["document"]["coverImageUrl"])
if stats.query_selector("div[title$=' views']"): title = ible["document"]["title"]
views = stats.query_selector("div[title$=' views']").inner_text() author = ible["document"]["screenName"]
author_link = f"/member/{author}"
favorites = 0 channel = ible["document"]["primaryClassification"]
channel_link = f"/channel/{channel}"
if stats.query_selector("div[title$=' favorites']"): views = ible["document"]["views"]
favorites = stats.query_selector("div[title$=' favorites']").inner_text() favorites = ible["document"]["favorites"]
ibles.append( ibles.append(
[ [
@ -410,31 +409,43 @@ def project_list(path, head, sort=""):
if len(ibles) >= 8: if len(ibles) >= 8:
break break
browser.close() return render_template("projects.html", data=[head, ibles, path])
playwright.stop()
return render_template("projects.html", data=[head, ibles, path_])
@app.route("/sitemap/")
@app.route("/sitemap/<path:path>")
def route_sitemap(path=""):
    """Render the Instructables sitemap index or one of its sub-pages.

    Fetches ``https://www.instructables.com/sitemap/<path>`` and extracts the
    link listings from its ``div.sitemap-content`` element.

    Args:
        path: Optional sub-path below ``/sitemap/``; empty for the index.

    Returns:
        The rendered ``sitemap.html`` template. Its ``data`` is a list of
        ``[category, category_link, channels]`` entries, where ``channels``
        is a list of ``[channel, channel_link]`` pairs. Pages without
        ``div.group-section`` elements yield a single entry with an empty
        category and category link.

    An ``HTTPError`` from the upstream request aborts with the upstream
    status code.
    """

    def listing(node):
        # [text, href] for every link in the node's sitemap listing.
        return [
            [li.a.text, li.a["href"]] for li in node.select("ul.sitemap-listing li")
        ]

    try:
        # quote() keeps spaces and non-ASCII characters in the routed path
        # from producing an invalid upstream URL; the context manager closes
        # the response once the body has been read.
        with urlopen(
            "https://www.instructables.com/sitemap/" + quote(path)
        ) as data:
            html = data.read().decode()
    except HTTPError as e:
        abort(e.code)

    soup = BeautifulSoup(html, "html.parser")

    main = soup.select("div.sitemap-content")[0]

    group_section = main.select("div.group-section")

    if group_section:
        groups = []
        for group in group_section:
            heading = group.select("h2 a")[0]
            groups.append([heading.text, heading.get("href"), listing(group)])
    else:
        # Sub-pages have no group sections: expose their flat listing as one
        # unnamed group so the template can treat both layouts alike.
        groups = [["", "", listing(main)]]

    return render_template("sitemap.html", data=groups)
@ -444,11 +455,13 @@ def route_contest_archive():
page = 1 page = 1
if request.args.get("page") != None: if request.args.get("page") != None:
page = request.args.get("page") page = request.args.get("page")
data = requests.get(f"https://www.instructables.com/contest/archive/?page={page}")
if data.status_code != 200:
abort(data.status_code)
soup = BeautifulSoup(data.text, "html.parser") try:
data = urlopen(f"https://www.instructables.com/contest/archive/?page={page}")
except HTTPError as e:
abort(e.code)
soup = BeautifulSoup(data.read().decode(), "html.parser")
main = soup.select("div#contest-archive-wrapper")[0] main = soup.select("div#contest-archive-wrapper")[0]
@ -481,11 +494,12 @@ def route_contest_archive():
@app.route("/contest/<contest>/") @app.route("/contest/<contest>/")
def route_contest(contest): def route_contest(contest):
data = requests.get(f"https://www.instructables.com/contest/{contest}/") try:
if data.status_code != 200: data = urlopen(f"https://www.instructables.com/contest/{contest}/")
abort(data.status_code) except HTTPError as e:
abort(e.code)
soup = BeautifulSoup(data.text, "html.parser") soup = BeautifulSoup(data.read().decode(), "html.parser")
title = soup.select('meta[property="og:title"]')[0].get("content") title = soup.select('meta[property="og:title"]')[0].get("content")
@ -500,7 +514,7 @@ def route_contest(contest):
info.select("div#site-announcements-page")[0].decompose() info.select("div#site-announcements-page")[0].decompose()
info.select("h3")[0].decompose() info.select("h3")[0].decompose()
info.select("div#contest-body-nav")[0].decompose() info.select("div#contest-body-nav")[0].decompose()
info = str(info).replace("https://www.instructables.com", "{instance_root_url}") info = str(info).replace("https://www.instructables.com", "/")
entries = body.select("span.contest-entity-count")[0].text entries = body.select("span.contest-entity-count")[0].text
@ -535,11 +549,12 @@ def route_contest(contest):
@app.route("/contest/") @app.route("/contest/")
def route_contests(): def route_contests():
data = requests.get("https://www.instructables.com/contest/") try:
if data.status_code != 200: data = urlopen("https://www.instructables.com/contest/")
abort(data.status_code) except HTTPError as e:
abort(e.code)
soup = BeautifulSoup(data.text, "html.parser") soup = BeautifulSoup(data.read().decode(), "html.parser")
contest_count = str(soup.select("p.contest-count")[0]) contest_count = str(soup.select("p.contest-count")[0])
@ -660,11 +675,12 @@ def route_sitemap_circuits(category, subcategory):
@app.route("/member/<member>/instructables/") @app.route("/member/<member>/instructables/")
def route_member_instructables(member): def route_member_instructables(member):
data = requests.get(f"https://www.instructables.com/member/{member}/instructables") try:
if data.status_code != 200: data = urlopen(f"https://www.instructables.com/member/{member}/instructables/")
abort(data.status_code) except HTTPError as e:
abort(e.code)
soup = BeautifulSoup(data.text, "html.parser") soup = BeautifulSoup(data.read().decode(), "html.parser")
header = soup.select(".profile-header.profile-header-social")[0] header = soup.select(".profile-header.profile-header-social")[0]
header_content = member_header(header) header_content = member_header(header)
@ -697,13 +713,16 @@ def route_member(member):
"User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0" "User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0"
} }
data = requests.get( request = Request(
f"https://www.instructables.com/member/{member}/", headers=headers f"https://www.instructables.com/member/{member}/", headers=headers
) )
if data.status_code != 200:
abort(data.status_code)
soup = BeautifulSoup(data.text, "html.parser") try:
data = urlopen(request)
except HTTPError as e:
abort(e.code)
soup = BeautifulSoup(data.read().decode(), "html.parser")
header_content = member_header(soup) header_content = member_header(soup)
@ -753,11 +772,12 @@ def route_member(member):
@app.route("/<article>/") @app.route("/<article>/")
def route_article(article): def route_article(article):
data = requests.get(f"https://www.instructables.com/{article}/") try:
if data.status_code != 200: data = urlopen(f"https://www.instructables.com/{article}/")
abort(data.status_code) except HTTPError as e:
abort(e.code)
soup = BeautifulSoup(data.text, "html.parser") soup = BeautifulSoup(data.read().decode(), "html.parser")
try: try:
header = soup.select("header") header = soup.select("header")
@ -800,9 +820,7 @@ def route_article(article):
step_text = str(step.select("div.step-body")[0]) step_text = str(step.select("div.step-body")[0])
step_text = step_text.replace( step_text = step_text.replace(
"https://content.instructables.com", "https://content.instructables.com",
"{instance_root_url}/proxy/?url=https://content.instructables.com".format( "/proxy/?url=https://content.instructables.com",
instance_root_url=get_instance_root_url(request)
),
) )
steps.append([step_title, step_imgs, step_text, step_videos]) steps.append([step_title, step_imgs, step_text, step_videos])
@ -941,6 +959,7 @@ def route_article(article):
@app.route("/<category>/<channel>/") @app.route("/<category>/<channel>/")
def route_channel_redirect(category, channel): def route_channel_redirect(category, channel):
# TODO: Just check if the channel exists
if ( if (
category == "circuits" category == "circuits"
or category == "workshop" or category == "workshop"
@ -957,11 +976,12 @@ def route_channel_redirect(category, channel):
@app.route("/") @app.route("/")
def route_explore(): def route_explore():
data = requests.get("https://www.instructables.com/") try:
if data.status_code != 200: data = urlopen("https://www.instructables.com/")
abort(data.status_code) except HTTPError as e:
abort(e.code)
soup = BeautifulSoup(data.text, "html.parser") soup = BeautifulSoup(data.read().decode(), "html.parser")
explore = soup.select(".home-content-explore-wrap")[0] explore = soup.select(".home-content-explore-wrap")[0]
@ -994,32 +1014,43 @@ def route_proxy():
if url.startswith("https://cdn.instructables.com/") or url.startswith( if url.startswith("https://cdn.instructables.com/") or url.startswith(
"https://content.instructables.com/" "https://content.instructables.com/"
): ):
data = requests.get(unquote(url)) try:
return Response(data.content, content_type=data.headers["content-type"]) data = urlopen(unquote(url))
except HTTPError as e:
abort(e.code)
return Response(data.read(), content_type=data.headers["content-type"])
else: else:
raise BadRequest() raise BadRequest()
else: else:
raise BadRequest() raise BadRequest()
@app.route("/privacypolicy/") @app.route("/privacypolicy/")
def privacypolicy(): def privacypolicy():
# TODO: Make this dynamic
return render_template("privacypolicy.html") return render_template("privacypolicy.html")
@app.errorhandler(404)
def not_found(e):
    """Render the custom "not found" page for 404 errors."""
    # The status must be returned explicitly; otherwise the error page is
    # sent with the default 200 status.
    return render_template("404.html"), 404
@app.errorhandler(400)
def bad_request(e):
    """Render the custom "bad request" page for 400 errors."""
    # The status must be returned explicitly; otherwise the error page is
    # sent with the default 200 status.
    return render_template("400.html"), 400
@app.errorhandler(429)
def too_many_requests(e):
    """Render the custom "too many requests" page for 429 errors."""
    # The status must be returned explicitly; otherwise the error page is
    # sent with the default 200 status.
    return render_template("429.html"), 429
@app.errorhandler(500)
def internal_server_error(e):
    """Render the custom "internal server error" page for 500 errors."""
    # The status must be returned explicitly; otherwise the error page is
    # sent with the default 200 status.
    return render_template("500.html"), 500
if __name__ == '__main__':
if __name__ == "__main__":
app.run(port=args.port, host=args.listen_host, debug=debugmode) app.run(port=args.port, host=args.listen_host, debug=debugmode)

View file

@ -1,5 +1,2 @@
bs4 bs4
requests
flask flask
requests-html
playwright