Refactor data fetching logic using stdlib
Removed the dependencies on the external libraries `requests`, `requests-html`, and `playwright` in favor of Python's standard library, using `urllib` for HTTP requests and URL parsing. Introduced a more robust and efficient data update function that scrapes the Typesense API key and fetches project data through the Typesense search API. Transitioned from a manual browser-based scraping approach to an API-based one for improved stability and performance. Added logging for better monitoring and debuggability. Error handling now relies on `HTTPError` from `urllib.error`. Debug mode is now configured through an environment variable, in line with Twelve-Factor principles. Removed unused functions and streamlined the handling of various routes within the Flask app. Overall, this change makes the codebase more maintainable by using built-in libraries, reduces external dependencies, and improves the resilience and scalability of the web scraping components.
This commit is contained in:
parent
d269a9992e
commit
29592de90a
2 changed files with 201 additions and 173 deletions
369
main.py
369
main.py
|
@ -9,90 +9,121 @@ from flask import (
|
||||||
stream_with_context,
|
stream_with_context,
|
||||||
)
|
)
|
||||||
|
|
||||||
import requests
|
|
||||||
import re
|
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from urllib.parse import quote, unquote
|
from urllib.parse import quote, unquote
|
||||||
|
from urllib.request import Request, urlopen
|
||||||
|
from urllib.error import HTTPError
|
||||||
from traceback import print_exc
|
from traceback import print_exc
|
||||||
from requests_html import HTMLSession
|
from urllib.parse import urljoin, urlparse
|
||||||
from playwright.sync_api import sync_playwright
|
|
||||||
from urllib.parse import urljoin
|
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
|
from configparser import ConfigParser
|
||||||
|
|
||||||
from werkzeug.exceptions import BadRequest, abort, InternalServerError, NotFound
|
from werkzeug.exceptions import BadRequest, abort, InternalServerError, NotFound
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
global_ibles = {}
|
global_ibles = {}
|
||||||
|
|
||||||
def proxy(src):
|
|
||||||
return "/proxy/?url=" + quote(str(src))
|
def proxy(url):
    """Return the local ``/proxy/`` URL that relays *url* through this instance.

    The target is percent-encoded so that any ``?``, ``&`` or ``#`` it contains
    stays inside the single ``url`` query parameter instead of being parsed as
    part of the proxy URL itself.

    Args:
        url: The remote resource to relay; it is converted with ``str()``
            before encoding.

    Returns:
        A path of the form ``/proxy/?url=<percent-encoded url>``.
    """
    logging.debug(f"Generating proxy URL for {url}")
    return f"/proxy/?url={quote(str(url))}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_typesense_api_key():
    """Scrape the Typesense API key from the Instructables home page.

    Downloads https://www.instructables.com/, walks its ``<script>`` elements
    and returns the value of the first ``"typesenseApiKey"`` entry found.

    Returns:
        The key as a string, or ``None`` (after logging an error) when no
        script on the page contains one.

    Raises:
        Whatever ``urlopen`` raises for a failed request; nothing is caught here.
    """
    logging.debug("Getting Typesense API key...")

    # Use the response as a context manager so the connection is closed as
    # soon as the body has been read, rather than whenever it is collected.
    with urlopen("https://www.instructables.com/") as data:
        soup = BeautifulSoup(data.read().decode(), "html.parser")

    for script in soup.select("script"):
        text = script.text
        # Cheap substring test first; only run the regex on candidate scripts.
        if "typesense" not in text:
            continue

        matches = re.search(r'"typesenseApiKey":\s?"(.*?)"', text)
        if matches:
            api_key = matches.group(1)
            logging.debug(f"Identified Typesense API key as {api_key}")
            return api_key

    logging.error("Failed to get Typesense API key")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
TYPESENSE_API_KEY = get_typesense_api_key()
|
||||||
|
|
||||||
|
|
||||||
|
def projects_search(
    query="*",
    category="",
    channel="",
    filter_by="featureFlag:=true",
    page=1,
    per_page=50,
):
    """Search Instructables projects through the site's Typesense proxy.

    Args:
        query: Search text sent as ``q``; the default ``"*"`` is passed through
            percent-encoded.
        category: When non-empty, adds a ``category:=<category>`` filter clause.
        channel: When non-empty, adds a ``channel:=<channel>`` filter clause.
        filter_by: Base filter expression; extra clauses are appended to it
            with ``" && "``. May be empty.
        page: Result page number, interpolated into the URL as-is.
        per_page: Number of results per page, interpolated into the URL as-is.

    Returns:
        The value stored under ``"hits"`` in the decoded JSON response.

    Raises:
        Whatever ``urlopen`` raises for a failed request, and ``KeyError`` when
        the response has no ``"hits"`` entry; nothing is caught here.
    """
    # Collect the non-empty clauses and join them once; this yields the same
    # expression as appending " && " piece by piece.
    filters = [filter_by] if filter_by else []
    if category:
        filters.append(f"category:={category}")
    if channel:
        filters.append(f"channel:={channel}")

    query = quote(query)
    filter_by = quote(" && ".join(filters))

    logging.debug(f"Searching projects with query {query} and filter {filter_by}")

    projects_headers = {"x-typesense-api-key": TYPESENSE_API_KEY}
    projects_request = Request(
        f"https://www.instructables.com/api_proxy/search/collections/projects/documents/search?q={query}&query_by=title,stepBody,screenName&page={page}&sort_by=publishDate:desc&include_fields=title,urlString,coverImageUrl,screenName,favorites,views,primaryClassification,featureFlag,prizeLevel,IMadeItCount&filter_by={filter_by}&per_page={per_page}",
        headers=projects_headers,
    )

    # Close the connection as soon as the body has been read.
    with urlopen(projects_request) as projects_data:
        project_obj = json.loads(projects_data.read().decode())

    project_ibles = project_obj["hits"]

    logging.debug(f"Got {len(project_ibles)} projects")

    return project_ibles
|
||||||
|
|
||||||
|
|
||||||
def update_data():
|
def update_data():
|
||||||
playwright = sync_playwright().start()
|
logging.debug("Updating data...")
|
||||||
browser = playwright.chromium.launch(headless=True)
|
|
||||||
page = browser.new_page()
|
|
||||||
|
|
||||||
channels = []
|
channels = []
|
||||||
|
|
||||||
data = requests.get(f"https://www.instructables.com/sitemap/")
|
sitemap_data = urlopen("https://www.instructables.com/sitemap/")
|
||||||
|
sitemap_soup = BeautifulSoup(sitemap_data.read().decode(), "html.parser")
|
||||||
soup = BeautifulSoup(data.text, "html.parser")
|
main = sitemap_soup.select("div.sitemap-content")[0]
|
||||||
|
|
||||||
main = soup.select("div.sitemap-content")[0]
|
|
||||||
|
|
||||||
groups = []
|
groups = []
|
||||||
for group in main.select("div.group-section"):
|
for group in main.select("div.group-section"):
|
||||||
channels.append(group.select("h2 a")[0].text.lower())
|
channels.append(group.select("h2 a")[0].text.lower())
|
||||||
|
|
||||||
global_ibles["/projects"] = []
|
global_ibles["/projects"] = []
|
||||||
|
project_ibles = projects_search()
|
||||||
page.goto("https://www.instructables.com/projects")
|
|
||||||
|
|
||||||
while len(global_ibles["/projects"]) <= 0:
|
while len(global_ibles["/projects"]) <= 0:
|
||||||
for ible in page.query_selector_all(".ibleCard__QPJVm"):
|
for ible in project_ibles:
|
||||||
link = (
|
link = f"/{ible['document']['urlString']}"
|
||||||
ible.query_selector("a")
|
img = proxy(ible["document"]["coverImageUrl"])
|
||||||
.get_attribute("href")
|
|
||||||
.replace("https://www.instructables.com", "{instance_root_url}")
|
|
||||||
)
|
|
||||||
img = proxy(ible.query_selector("img").get_attribute("src"))
|
|
||||||
|
|
||||||
title = ible.query_selector(".title__t0fGQ").inner_text()
|
title = ible["document"]["title"]
|
||||||
author = ible.query_selector("a[href^='/member/']").inner_text()
|
author = ible["document"]["screenName"]
|
||||||
author_link = (
|
author_link = f"/member/{author}"
|
||||||
ible.query_selector("a[href^='/member/']")
|
|
||||||
.get_attribute("href")
|
|
||||||
.replace("https://www.instructables.com", "{instance_root_url}")
|
|
||||||
)
|
|
||||||
|
|
||||||
for c in channels:
|
channel = ible["document"]["primaryClassification"]
|
||||||
try:
|
channel_link = f"/channel/{channel}"
|
||||||
channel = ible.query_selector("a[href^='/" + c + "']").inner_text()
|
|
||||||
channel_link = (
|
|
||||||
ible.query_selector("a[href^='/" + c + "']")
|
|
||||||
.get_attribute("href")
|
|
||||||
.replace("https://www.instructables.com", "{instance_root_url}")
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
try:
|
|
||||||
channel = ible.query_selector("a[href^='/projects/']").inner_text()
|
|
||||||
channel_link = (
|
|
||||||
ible.query_selector("a[href^='/projects/']")
|
|
||||||
.get_attribute("href")
|
|
||||||
.replace("https://www.instructables.com", "{instance_root_url}")
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
stats = ible.query_selector(".stats__GFKyl")
|
views = ible["document"]["views"]
|
||||||
views = 0
|
favorites = ible["document"]["favorites"]
|
||||||
if stats.query_selector("div[title$=' views']"):
|
|
||||||
views = stats.query_selector("div[title$=' views']").inner_text()
|
|
||||||
favorites = 0
|
|
||||||
if stats.query_selector("div[title$=' favorites']"):
|
|
||||||
favorites = stats.query_selector("div[title$=' favorites']").inner_text()
|
|
||||||
|
|
||||||
global_ibles["/projects"].append(
|
global_ibles["/projects"].append(
|
||||||
[
|
[
|
||||||
|
@ -108,10 +139,8 @@ def update_data():
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
browser.close()
|
|
||||||
playwright.stop()
|
|
||||||
|
|
||||||
debugmode = False
|
# Environment values are always strings, so a bare os.environ.get() would treat
# FLASK_DEBUG=0 or FLASK_DEBUG=false as truthy. Interpret the variable as a
# boolean instead: unset, empty, "0", "false" and "no" mean off; anything else on.
debugmode = os.environ.get("FLASK_DEBUG", "").strip().lower() not in ("", "0", "false", "no")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = ArgumentParser()
|
parser = ArgumentParser()
|
||||||
|
@ -147,14 +176,16 @@ print("Started!")
|
||||||
|
|
||||||
app = Flask(__name__, template_folder="templates", static_folder="static")
|
app = Flask(__name__, template_folder="templates", static_folder="static")
|
||||||
|
|
||||||
def get_instance_root_url(request):
|
if debugmode:
|
||||||
return request.url_root
|
app.logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/cron/")
|
@app.route("/cron/")
|
||||||
def cron():
|
def cron():
|
||||||
update_data()
|
update_data()
|
||||||
return "OK"
|
return "OK"
|
||||||
|
|
||||||
|
|
||||||
def explore_lists(soup):
|
def explore_lists(soup):
|
||||||
list_ = []
|
list_ = []
|
||||||
for ible in soup.select(".home-content-explore-ible"):
|
for ible in soup.select(".home-content-explore-ible"):
|
||||||
|
@ -199,8 +230,6 @@ def member_header(header):
|
||||||
|
|
||||||
profile_top = header.select("div.profile-top")[0]
|
profile_top = header.select("div.profile-top")[0]
|
||||||
|
|
||||||
print(header.encode_contents())
|
|
||||||
|
|
||||||
# stats_text = profile_top.select("div.profile-header-stats")[0]
|
# stats_text = profile_top.select("div.profile-header-stats")[0]
|
||||||
# stats_num = header.select("div.profile-top div.profile-header-stats")[1]
|
# stats_num = header.select("div.profile-top div.profile-header-stats")[1]
|
||||||
|
|
||||||
|
@ -260,11 +289,12 @@ def member_header(header):
|
||||||
|
|
||||||
|
|
||||||
def category_page(path, name, teachers=False):
|
def category_page(path, name, teachers=False):
|
||||||
data = requests.get("https://www.instructables.com" + path)
|
try:
|
||||||
if data.status_code != 200:
|
data = urlopen("https://www.instructables.com" + path)
|
||||||
abort(data.status_code)
|
except HTTPError as e:
|
||||||
|
abort(e.code)
|
||||||
|
|
||||||
soup = BeautifulSoup(data.text, "html.parser")
|
soup = BeautifulSoup(data.read().decode(), "html.parser")
|
||||||
|
|
||||||
channels = []
|
channels = []
|
||||||
for card in soup.select("div.scrollable-cards-inner div.scrollable-card"):
|
for card in soup.select("div.scrollable-cards-inner div.scrollable-card"):
|
||||||
|
@ -328,70 +358,39 @@ def category_page(path, name, teachers=False):
|
||||||
|
|
||||||
|
|
||||||
def project_list(path, head, sort=""):
|
def project_list(path, head, sort=""):
|
||||||
playwright = sync_playwright().start()
|
|
||||||
browser = playwright.chromium.launch(headless=True)
|
|
||||||
page = browser.new_page()
|
|
||||||
|
|
||||||
page.goto(urljoin("https://www.instructables.com", path))
|
|
||||||
|
|
||||||
head = f"{head + ' ' if head != '' else ''}Projects" + sort
|
head = f"{head + ' ' if head != '' else ''}Projects" + sort
|
||||||
path_ = path.rsplit("/", 1)[0]
|
path = urlparse(path).path
|
||||||
|
|
||||||
if path == "/projects/" or path == "/projects":
|
if path in ("/projects/", "/projects"):
|
||||||
ibles = global_ibles["/projects"]
|
ibles = global_ibles["/projects"]
|
||||||
else:
|
else:
|
||||||
|
if not "projects" in path.split("/"):
|
||||||
|
abort(404)
|
||||||
|
|
||||||
ibles = []
|
ibles = []
|
||||||
|
|
||||||
for ible in page.query_selector_all(".ibleCard__QPJVm"):
|
parts = path.split("/")
|
||||||
link = (
|
|
||||||
ible.query_selector("a")
|
|
||||||
.get_attribute("href")
|
|
||||||
.replace("https://www.instructables.com", "{instance_root_url}")
|
|
||||||
)
|
|
||||||
img = proxy(
|
|
||||||
ible.find_elements(By.CSS_SELECTOR, "img")[0].get_attribute("src")
|
|
||||||
)
|
|
||||||
|
|
||||||
title = ible.find_elements(By.CLASS_NAME, "title__t0fGQ")[0].text
|
category = parts[1]
|
||||||
author = ible.find_elements(By.CSS_SELECTOR, "a[href^='/member/']")[0].text
|
channel = "" if parts[2] == "projects" else parts[2]
|
||||||
author_link = (
|
|
||||||
ible.find_elements(By.CSS_SELECTOR, "a[href^='/member/']")[0]
|
|
||||||
.get_attribute("href")
|
|
||||||
.replace("https://www.instructables.com", "{instance_root_url}")
|
|
||||||
)
|
|
||||||
|
|
||||||
channel = "TEST"
|
# TODO: Add pagination, popular, etc.
|
||||||
channel_link = "TEST"
|
|
||||||
|
|
||||||
for c in channels:
|
project_ibles = projects_search(category=category, channel=channel)
|
||||||
try:
|
|
||||||
channel = ible.query_selector("a[href^='/" + c + "']").inner_text()
|
|
||||||
channel_link = (
|
|
||||||
ible.query_selector("a[href^='/" + c + "']")
|
|
||||||
.get_attribute("href")
|
|
||||||
.replace("https://www.instructables.com", "{instance_root_url}")
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
try:
|
|
||||||
channel = ible.query_selector("a[href^='/projects/'] span").inner_text()
|
|
||||||
channel_link = (
|
|
||||||
ible.query_selector("a[href^='/projects/']")
|
|
||||||
.get_attribute("href")
|
|
||||||
.replace("https://www.instructables.com", "{instance_root_url}")
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
stats = ible.query_selector(".stats__GFKyl")
|
for ible in project_ibles:
|
||||||
views = 0
|
link = f"/{ible['document']['urlString']}"
|
||||||
|
img = proxy(ible["document"]["coverImageUrl"])
|
||||||
|
|
||||||
if stats.query_selector("div[title$=' views']"):
|
title = ible["document"]["title"]
|
||||||
views = stats.query_selector("div[title$=' views']").inner_text()
|
author = ible["document"]["screenName"]
|
||||||
|
author_link = f"/member/{author}"
|
||||||
|
|
||||||
favorites = 0
|
channel = ible["document"]["primaryClassification"]
|
||||||
|
channel_link = f"/channel/{channel}"
|
||||||
|
|
||||||
if stats.query_selector("div[title$=' favorites']"):
|
views = ible["document"]["views"]
|
||||||
favorites = stats.query_selector("div[title$=' favorites']").inner_text()
|
favorites = ible["document"]["favorites"]
|
||||||
|
|
||||||
ibles.append(
|
ibles.append(
|
||||||
[
|
[
|
||||||
|
@ -410,31 +409,43 @@ def project_list(path, head, sort=""):
|
||||||
if len(ibles) >= 8:
|
if len(ibles) >= 8:
|
||||||
break
|
break
|
||||||
|
|
||||||
browser.close()
|
return render_template("projects.html", data=[head, ibles, path])
|
||||||
playwright.stop()
|
|
||||||
|
|
||||||
return render_template("projects.html", data=[head, ibles, path_])
|
|
||||||
|
|
||||||
@app.route("/sitemap/")
|
@app.route("/sitemap/")
|
||||||
def route_sitemap():
|
@app.route("/sitemap/<path:path>")
|
||||||
data = requests.get(f"https://www.instructables.com/sitemap/")
|
def route_sitemap(path=""):
|
||||||
if data.status_code != 200:
|
try:
|
||||||
abort(data.status_code)
|
data = urlopen("https://www.instructables.com/sitemap/" + path)
|
||||||
|
except HTTPError as e:
|
||||||
|
abort(e.code)
|
||||||
|
|
||||||
soup = BeautifulSoup(data.text, "html.parser")
|
soup = BeautifulSoup(data.read().decode(), "html.parser")
|
||||||
|
|
||||||
main = soup.select("div.sitemap-content")[0]
|
main = soup.select("div.sitemap-content")[0]
|
||||||
|
|
||||||
groups = []
|
group_section = main.select("div.group-section")
|
||||||
for group in main.select("div.group-section"):
|
|
||||||
category = group.select("h2 a")[0].text
|
if group_section:
|
||||||
category_link = group.select("h2 a")[0].get("href")
|
groups = []
|
||||||
|
for group in group_section:
|
||||||
|
category = group.select("h2 a")[0].text
|
||||||
|
category_link = group.select("h2 a")[0].get("href")
|
||||||
|
channels = []
|
||||||
|
for li in group.select("ul.sitemap-listing li"):
|
||||||
|
channel = li.a.text
|
||||||
|
channel_link = li.a["href"]
|
||||||
|
channels.append([channel, channel_link])
|
||||||
|
groups.append([category, category_link, channels])
|
||||||
|
|
||||||
|
else:
|
||||||
|
groups = []
|
||||||
channels = []
|
channels = []
|
||||||
for li in group.select("ul.sitemap-listing li"):
|
for li in main.select("ul.sitemap-listing li"):
|
||||||
channel = li.a.text
|
channel = li.a.text
|
||||||
channel_link = li.a["href"]
|
channel_link = li.a["href"]
|
||||||
channels.append([channel, channel_link])
|
channels.append([channel, channel_link])
|
||||||
groups.append([category, category_link, channels])
|
groups.append(["", "", channels])
|
||||||
|
|
||||||
return render_template("sitemap.html", data=groups)
|
return render_template("sitemap.html", data=groups)
|
||||||
|
|
||||||
|
@ -444,11 +455,13 @@ def route_contest_archive():
|
||||||
page = 1
|
page = 1
|
||||||
if request.args.get("page") != None:
|
if request.args.get("page") != None:
|
||||||
page = request.args.get("page")
|
page = request.args.get("page")
|
||||||
data = requests.get(f"https://www.instructables.com/contest/archive/?page={page}")
|
|
||||||
if data.status_code != 200:
|
|
||||||
abort(data.status_code)
|
|
||||||
|
|
||||||
soup = BeautifulSoup(data.text, "html.parser")
|
try:
|
||||||
|
data = urlopen(f"https://www.instructables.com/contest/archive/?page={page}")
|
||||||
|
except HTTPError as e:
|
||||||
|
abort(e.code)
|
||||||
|
|
||||||
|
soup = BeautifulSoup(data.read().decode(), "html.parser")
|
||||||
|
|
||||||
main = soup.select("div#contest-archive-wrapper")[0]
|
main = soup.select("div#contest-archive-wrapper")[0]
|
||||||
|
|
||||||
|
@ -481,11 +494,12 @@ def route_contest_archive():
|
||||||
|
|
||||||
@app.route("/contest/<contest>/")
|
@app.route("/contest/<contest>/")
|
||||||
def route_contest(contest):
|
def route_contest(contest):
|
||||||
data = requests.get(f"https://www.instructables.com/contest/{contest}/")
|
try:
|
||||||
if data.status_code != 200:
|
data = urlopen(f"https://www.instructables.com/contest/{contest}/")
|
||||||
abort(data.status_code)
|
except HTTPError as e:
|
||||||
|
abort(e.code)
|
||||||
|
|
||||||
soup = BeautifulSoup(data.text, "html.parser")
|
soup = BeautifulSoup(data.read().decode(), "html.parser")
|
||||||
|
|
||||||
title = soup.select('meta[property="og:title"]')[0].get("content")
|
title = soup.select('meta[property="og:title"]')[0].get("content")
|
||||||
|
|
||||||
|
@ -500,7 +514,7 @@ def route_contest(contest):
|
||||||
info.select("div#site-announcements-page")[0].decompose()
|
info.select("div#site-announcements-page")[0].decompose()
|
||||||
info.select("h3")[0].decompose()
|
info.select("h3")[0].decompose()
|
||||||
info.select("div#contest-body-nav")[0].decompose()
|
info.select("div#contest-body-nav")[0].decompose()
|
||||||
info = str(info).replace("https://www.instructables.com", "{instance_root_url}")
|
info = str(info).replace("https://www.instructables.com", "/")
|
||||||
|
|
||||||
entries = body.select("span.contest-entity-count")[0].text
|
entries = body.select("span.contest-entity-count")[0].text
|
||||||
|
|
||||||
|
@ -535,11 +549,12 @@ def route_contest(contest):
|
||||||
|
|
||||||
@app.route("/contest/")
|
@app.route("/contest/")
|
||||||
def route_contests():
|
def route_contests():
|
||||||
data = requests.get("https://www.instructables.com/contest/")
|
try:
|
||||||
if data.status_code != 200:
|
data = urlopen("https://www.instructables.com/contest/")
|
||||||
abort(data.status_code)
|
except HTTPError as e:
|
||||||
|
abort(e.code)
|
||||||
|
|
||||||
soup = BeautifulSoup(data.text, "html.parser")
|
soup = BeautifulSoup(data.read().decode(), "html.parser")
|
||||||
|
|
||||||
contest_count = str(soup.select("p.contest-count")[0])
|
contest_count = str(soup.select("p.contest-count")[0])
|
||||||
|
|
||||||
|
@ -660,11 +675,12 @@ def route_sitemap_circuits(category, subcategory):
|
||||||
|
|
||||||
@app.route("/member/<member>/instructables/")
|
@app.route("/member/<member>/instructables/")
|
||||||
def route_member_instructables(member):
|
def route_member_instructables(member):
|
||||||
data = requests.get(f"https://www.instructables.com/member/{member}/instructables")
|
try:
|
||||||
if data.status_code != 200:
|
data = urlopen(f"https://www.instructables.com/member/{member}/instructables/")
|
||||||
abort(data.status_code)
|
except HTTPError as e:
|
||||||
|
abort(e.code)
|
||||||
|
|
||||||
soup = BeautifulSoup(data.text, "html.parser")
|
soup = BeautifulSoup(data.read().decode(), "html.parser")
|
||||||
|
|
||||||
header = soup.select(".profile-header.profile-header-social")[0]
|
header = soup.select(".profile-header.profile-header-social")[0]
|
||||||
header_content = member_header(header)
|
header_content = member_header(header)
|
||||||
|
@ -697,13 +713,16 @@ def route_member(member):
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0"
|
"User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0"
|
||||||
}
|
}
|
||||||
|
|
||||||
data = requests.get(
|
request = Request(
|
||||||
f"https://www.instructables.com/member/{member}/", headers=headers
|
f"https://www.instructables.com/member/{member}/", headers=headers
|
||||||
)
|
)
|
||||||
if data.status_code != 200:
|
|
||||||
abort(data.status_code)
|
|
||||||
|
|
||||||
soup = BeautifulSoup(data.text, "html.parser")
|
try:
|
||||||
|
data = urlopen(request)
|
||||||
|
except HTTPError as e:
|
||||||
|
abort(e.code)
|
||||||
|
|
||||||
|
soup = BeautifulSoup(data.read().decode(), "html.parser")
|
||||||
|
|
||||||
header_content = member_header(soup)
|
header_content = member_header(soup)
|
||||||
|
|
||||||
|
@ -753,11 +772,12 @@ def route_member(member):
|
||||||
|
|
||||||
@app.route("/<article>/")
|
@app.route("/<article>/")
|
||||||
def route_article(article):
|
def route_article(article):
|
||||||
data = requests.get(f"https://www.instructables.com/{article}/")
|
try:
|
||||||
if data.status_code != 200:
|
data = urlopen(f"https://www.instructables.com/{article}/")
|
||||||
abort(data.status_code)
|
except HTTPError as e:
|
||||||
|
abort(e.code)
|
||||||
|
|
||||||
soup = BeautifulSoup(data.text, "html.parser")
|
soup = BeautifulSoup(data.read().decode(), "html.parser")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
header = soup.select("header")
|
header = soup.select("header")
|
||||||
|
@ -800,9 +820,7 @@ def route_article(article):
|
||||||
step_text = str(step.select("div.step-body")[0])
|
step_text = str(step.select("div.step-body")[0])
|
||||||
step_text = step_text.replace(
|
step_text = step_text.replace(
|
||||||
"https://content.instructables.com",
|
"https://content.instructables.com",
|
||||||
"{instance_root_url}/proxy/?url=https://content.instructables.com".format(
|
"/proxy/?url=https://content.instructables.com",
|
||||||
instance_root_url=get_instance_root_url(request)
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
steps.append([step_title, step_imgs, step_text, step_videos])
|
steps.append([step_title, step_imgs, step_text, step_videos])
|
||||||
|
|
||||||
|
@ -941,6 +959,7 @@ def route_article(article):
|
||||||
|
|
||||||
@app.route("/<category>/<channel>/")
|
@app.route("/<category>/<channel>/")
|
||||||
def route_channel_redirect(category, channel):
|
def route_channel_redirect(category, channel):
|
||||||
|
# TODO: Just check if the channel exists
|
||||||
if (
|
if (
|
||||||
category == "circuits"
|
category == "circuits"
|
||||||
or category == "workshop"
|
or category == "workshop"
|
||||||
|
@ -957,11 +976,12 @@ def route_channel_redirect(category, channel):
|
||||||
|
|
||||||
@app.route("/")
|
@app.route("/")
|
||||||
def route_explore():
|
def route_explore():
|
||||||
data = requests.get("https://www.instructables.com/")
|
try:
|
||||||
if data.status_code != 200:
|
data = urlopen("https://www.instructables.com/")
|
||||||
abort(data.status_code)
|
except HTTPError as e:
|
||||||
|
abort(e.code)
|
||||||
|
|
||||||
soup = BeautifulSoup(data.text, "html.parser")
|
soup = BeautifulSoup(data.read().decode(), "html.parser")
|
||||||
|
|
||||||
explore = soup.select(".home-content-explore-wrap")[0]
|
explore = soup.select(".home-content-explore-wrap")[0]
|
||||||
|
|
||||||
|
@ -994,32 +1014,43 @@ def route_proxy():
|
||||||
if url.startswith("https://cdn.instructables.com/") or url.startswith(
|
if url.startswith("https://cdn.instructables.com/") or url.startswith(
|
||||||
"https://content.instructables.com/"
|
"https://content.instructables.com/"
|
||||||
):
|
):
|
||||||
data = requests.get(unquote(url))
|
try:
|
||||||
return Response(data.content, content_type=data.headers["content-type"])
|
data = urlopen(unquote(url))
|
||||||
|
except HTTPError as e:
|
||||||
|
abort(e.code)
|
||||||
|
|
||||||
|
return Response(data.read(), content_type=data.headers["content-type"])
|
||||||
else:
|
else:
|
||||||
raise BadRequest()
|
raise BadRequest()
|
||||||
else:
|
else:
|
||||||
raise BadRequest()
|
raise BadRequest()
|
||||||
|
|
||||||
|
|
||||||
@app.route("/privacypolicy/")
|
@app.route("/privacypolicy/")
|
||||||
def privacypolicy():
|
def privacypolicy():
|
||||||
|
# TODO: Make this dynamic
|
||||||
return render_template("privacypolicy.html")
|
return render_template("privacypolicy.html")
|
||||||
|
|
||||||
|
|
||||||
@app.errorhandler(404)
|
@app.errorhandler(404)
|
||||||
def not_found(e):
|
def not_found(e):
|
||||||
return render_template("404.html")
|
return render_template("404.html")
|
||||||
|
|
||||||
|
|
||||||
@app.errorhandler(400)
|
@app.errorhandler(400)
|
||||||
def bad_request(e):
|
def bad_request(e):
|
||||||
return render_template("400.html")
|
return render_template("400.html")
|
||||||
|
|
||||||
|
|
||||||
@app.errorhandler(429)
|
@app.errorhandler(429)
|
||||||
def too_many_requests(e):
|
def too_many_requests(e):
|
||||||
return render_template("429.html")
|
return render_template("429.html")
|
||||||
|
|
||||||
|
|
||||||
@app.errorhandler(500)
|
@app.errorhandler(500)
|
||||||
def internal_server_error(e):
|
def internal_server_error(e):
|
||||||
return render_template("500.html")
|
return render_template("500.html")
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
|
if __name__ == "__main__":
|
||||||
app.run(port=args.port, host=args.listen_host, debug=debugmode)
|
app.run(port=args.port, host=args.listen_host, debug=debugmode)
|
||||||
|
|
|
@ -1,5 +1,2 @@
|
||||||
bs4
|
bs4
|
||||||
requests
|
|
||||||
flask
|
flask
|
||||||
requests-html
|
|
||||||
playwright
|
|
||||||
|
|
Loading…
Reference in a new issue