From bc9074fc2b9caabc5d8c07dac0517aacba43afa0 Mon Sep 17 00:00:00 2001 From: Kumi Date: Sat, 25 May 2024 16:48:15 +0200 Subject: [PATCH] feat: implement dynamic content loading and config management Introduce dynamic loading of project data and streamlined configuration management. This update modularizes the codebase, separating routes and utilities for improved maintainability and scalability. It also adds a new configuration system to manage environment variables more efficiently, ensuring a seamless setup process for different environments. The refactor includes the addition of Typesense API key retrieval for enhanced search capabilities and the initialization of the application with environment-specific settings, providing a foundation for further feature development and optimization. The change underscores a shift towards a more maintainable and scalable architecture, setting the stage for future enhancements and features. By organizing the route and utility functions into separate modules, the codebase becomes easier to navigate and extend. The new config management approach centralizes settings, aiding in the smooth adaptation to varying deployment conditions. Additionally, the introduction of dynamic project data loading aims to keep the content fresh and relevant, improving the user experience. This commit marks a significant step in the project's evolution, streamlining its structure and paving the way for continuous improvement and growth. --- pyproject.toml | 2 +- src/structables/config.py | 13 + src/structables/main.py | 1313 +--------------------------- src/structables/routes/__init__.py | 12 + src/structables/routes/category.py | 76 ++ src/structables/routes/contest.py | 177 ++++ src/structables/routes/main.py | 362 ++++++++ src/structables/routes/member.py | 124 +++ src/structables/routes/proxy.py | 50 ++ src/structables/utils/data.py | 56 ++ src/structables/utils/helpers.py | 451 ++++++++++ 11 files changed, 1332 insertions(+), 1304 deletions(-) create mode 100644 src/structables/config.py create mode 100644 src/structables/routes/__init__.py create mode 100644 src/structables/routes/category.py create mode 100644 src/structables/routes/contest.py create mode 100644 src/structables/routes/main.py create mode 100644 src/structables/routes/member.py create mode 100644 src/structables/routes/proxy.py create mode 100644 src/structables/utils/data.py create mode 100644 src/structables/utils/helpers.py diff --git a/pyproject.toml b/pyproject.toml index b949c40..80c0f8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "structables" -version = "0.2.2" +version = "0.3.0" authors = [ { name="Private.coffee Team", email="support@private.coffee" }, ] diff --git a/src/structables/config.py b/src/structables/config.py new file mode 100644 index 0000000..fd9dc36 --- /dev/null +++ b/src/structables/config.py @@ -0,0 +1,13 @@ +import os + +class Config: + DEBUG = os.environ.get("FLASK_DEBUG", os.environ.get("STRUCTABLES_DEBUG", False)) + PORT = int(os.environ.get("STRUCTABLES_PORT", 8002)) + LISTEN_HOST = os.environ.get("STRUCTABLES_LISTEN_HOST", "127.0.0.1") + INVIDIOUS = os.environ.get("STRUCTABLES_INVIDIOUS") + UNSAFE = os.environ.get("STRUCTABLES_UNSAFE", False) + PRIVACY_FILE = os.environ.get("STRUCTABLES_PRIVACY_FILE", "privacy.txt") + + @staticmethod + def init_app(app): + pass \ No newline at end of file diff --git a/src/structables/main.py b/src/structables/main.py index cdf1c1a..65cb714 100644 --- a/src/structables/main.py +++ b/src/structables/main.py @@ -1,1313 +1,20 @@ #!/usr/bin/env python -from flask import ( - Flask, - render_template, - request, - redirect, - Response, -) - -from urllib.parse import quote, unquote, urlencode -from urllib.request import Request, urlopen -from urllib.error import HTTPError -from traceback import print_exc -from urllib.parse import urlparse -from argparse import ArgumentParser -from typing import List - -from werkzeug.exceptions import BadRequest, abort, InternalServerError, NotFound -from bs4 import BeautifulSoup - -import os -import json -import re -import logging -import pathlib - -logging.basicConfig(level=logging.DEBUG) - -global_ibles = {} - - -def proxy(url): - logging.debug(f"Generating proxy URL for {url}") - return f"/proxy/?url={url}" - - -def get_typesense_api_key(): - logging.debug("Getting Typesense API key...") - - data = urlopen("https://www.instructables.com/") - soup = BeautifulSoup(data.read().decode(), "html.parser") - scripts = soup.select("script") - - for script in scripts: - if "typesense" in script.text and ( - matches := re.search(r'"typesenseApiKey":\s?"(.*?)"', script.text) - ): - api_key = matches.group(1) - logging.debug(f"Identified Typesense API key as {api_key}") - return api_key - - logging.error("Failed to get Typesense API key") - - -TYPESENSE_API_KEY = get_typesense_api_key() - -debugmode = False -invidious = None -unsafe = False - - -def unslugify(slug: str) -> List[str]: - """Return a list of possible original titles for a slug. - - Args: - slug (str): The slug to unslugify. - - Returns: - List[str]: A list of possible original titles for the slug. - """ - - results = [] - - results.append(slug.replace("-", " ").title()) - - if "and" in slug: - results.append(results[0].replace("And", "&").title()) - - return results - - -def projects_search( - query="*", - category="", - channel="", - filter_by="", - page=1, - per_page=50, - query_by="title,stepBody,screenName", - sort_by="publishDate:desc", - timeout=5, -): - if category: - if filter_by: - filter_by += " && " - filter_by += f"category:={category}" - - if channel: - if filter_by: - filter_by += " && " - filter_by += f"channel:={channel}" - - query = quote(query) - filter_by = quote(filter_by) - - logging.debug( - f"Searching projects with query {query} and filter {filter_by}, page {page}" - ) - - projects_headers = {"x-typesense-api-key": TYPESENSE_API_KEY} - - request_args = { - "q": query, - "query_by": query_by, - "page": page, - "sort_by": sort_by, - "include_fields": "title,urlString,coverImageUrl,screenName,favorites,views,primaryClassification,featureFlag,prizeLevel,IMadeItCount", - "filter_by": filter_by, - "per_page": per_page, - } - - args_str = "&".join([f"{key}={value}" for key, value in request_args.items()]) - - projects_request = Request( - f"https://www.instructables.com/api_proxy/search/collections/projects/documents/search?{args_str}", - headers=projects_headers, - ) - - projects_data = urlopen(projects_request, timeout=timeout) - project_obj = json.loads(projects_data.read().decode()) - project_ibles = project_obj["hits"] - - logging.debug(f"Got {len(project_ibles)} projects") - - return project_ibles, project_obj["out_of"] - - -def update_data(): - logging.debug("Updating data...") - - channels = [] - - sitemap_data = urlopen("https://www.instructables.com/sitemap/") - sitemap_soup = BeautifulSoup(sitemap_data.read().decode(), "html.parser") - main = sitemap_soup.select("div.sitemap-content")[0] - - for group in main.select("div.group-section"): - channels.append(group.select("h2 a")[0].text.lower()) - - global_ibles["/projects"] = [] - project_ibles, total = projects_search(filter_by="featureFlag:=true") - - while len(global_ibles["/projects"]) <= 0: - for ible in project_ibles: - link = f"/{ible['document']['urlString']}" - img = proxy(ible["document"]["coverImageUrl"]) - - title = ible["document"]["title"] - author = ible["document"]["screenName"] - author_link = f"/member/{author}" - - channel = ible["document"]["primaryClassification"] - channel_link = f"/channel/{channel}" - - views = ible["document"]["views"] - favorites = ible["document"]["favorites"] - - global_ibles["/projects"].append( - { - "link": link, - "img": img, - "title": title, - "author": author, - "author_link": author_link, - "channel": channel, - "channel_link": channel_link, - "views": views, - "favorites": favorites, - } - ) +from flask import Flask +from .config import Config +from .routes import init_routes +from .utils.data import update_data +from .utils.helpers import get_typesense_api_key app = Flask(__name__, template_folder="templates", static_folder="static") +app.config.from_object(Config) +app.typesense_api_key = get_typesense_api_key() - -@app.route("/cron/") -def cron(): - update_data() - return "OK" - - -def explore_lists(soup): - list_ = [] - for ible in soup.select(".home-content-explore-ible"): - link = ible.a["href"] - img = proxy(ible.select("a img")[0].get("data-src")) - alt = ible.select("a img")[0].get("alt") - title = ible.select("div strong a")[0].text - author = ible.select("div span.ible-author a")[0].text - author_link = ible.select("div span.ible-author a")[0].get("href") - channel = ible.select("div span.ible-channel a")[0].text - channel_link = ible.select("div span.ible-channel a")[0].get("href") - views = 0 - if ible.select("span.ible-views") != []: - views = ible.select("span.ible-views")[0].text - favorites = 0 - if ible.select("span.ible-favorites") != []: - favorites = ible.select("span.ible-favorites")[0].text - list_.append( - { - "link": link, - "img": img, - "alt": alt, - "title": title, - "author": author, - "author_link": author_link, - "channel": channel, - "channel_link": channel_link, - "favorites": favorites, - "views": views, - } - ) - return list_ - - -def member_header(header): - avatar = proxy( - header.select("div.profile-avatar-container img.profile-avatar")[0].get("src") - ) - title = header.select("div.profile-top div.profile-headline h1.profile-title")[ - 0 - ].text - - header.select("div.profile-top")[0] - - # stats_text = profile_top.select("div.profile-header-stats")[0] - # stats_num = header.select("div.profile-top div.profile-header-stats")[1] - - location = header.select("span.member-location") - if location != []: - location = location[0].text - else: - location = 0 - - signup = header.select("span.member-signup-date") - if signup != []: - signup = signup[0].text - else: - signup = 0 - - instructables = header.select("span.ible-count") - if instructables != []: - instructables = instructables[0].text - else: - instructables = 0 - - views = header.select("span.total-views") - if views != []: - views = views[0].text - else: - views = 0 - - comments = header.select("span.total-comments") - if comments != []: - comments = comments[0].text - else: - comments = 0 - - followers = header.select("span.follower-count") - if followers != []: - followers = followers[0].text - else: - followers = 0 - - bio = header.select("span.member-bio") - if bio != []: - bio = bio[0].text - else: - bio = "" - - return { - "avatar": avatar, - "title": title, - "location": location, - "signup": signup, - "instructables": instructables, - "views": views, - "comments": comments, - "followers": followers, - "bio": bio, - } - - -def category_page(name, teachers=False): - path = urlparse(request.path).path - page = request.args.get("page", 1, type=int) - - ibles = [] - - channels = [] - contests = [] - - for channel in global_ibles["/projects"]: - if ( - channel["channel"].startswith(name.lower()) - and channel["channel"] not in channels - ): - channels.append(channel["channel"]) - - category_ibles, total = projects_search( - category=name, page=page, filter_by="featureFlag:=true" - ) - - for ible in category_ibles: - link = f"/{ible['document']['urlString']}" - img = proxy(ible["document"]["coverImageUrl"]) - - title = ible["document"]["title"] - author = ible["document"]["screenName"] - author_link = f"/member/{author}" - - channel = ible["document"]["primaryClassification"] - channel_link = f"/channel/{channel}" - - views = ible["document"]["views"] - favorites = ible["document"]["favorites"] - - ibles.append( - { - "link": link, - "img": img, - "title": title, - "author": author, - "author_link": author_link, - "channel": channel, - "channel_link": channel_link, - "views": views, - "favorites": favorites, - } - ) - - return render_template( - "category.html", - title=name, - channels=channels, - ibles=ibles, - contests=contests, - path=path, - ) - - -def get_pagination(request, total, per_page=1): - pagination = [] - - args = request.args.copy() - current = int(args.pop("page", 1)) - - query_string = urlencode(args) - - total_pages = int(total / per_page) - - if query_string: - query_string = "&" + query_string - - if current > 1: - pagination.append( - { - "link": f"?page={current - 1}{query_string}", - "text": "Previous", - "disabled": False, - "active": False, - } - ) - - for page in range(max(current - 5, 1), min(current + 5, total_pages)): - if page == current: - pagination.append( - { - "link": f"?page={page}{query_string}", - "text": page, - "disabled": False, - "active": True, - } - ) - else: - pagination.append( - { - "link": f"?page={page}{query_string}", - "text": page, - "disabled": False, - "active": False, - } - ) - - if current < total_pages: - pagination.append( - { - "link": f"?page={current + 1}{query_string}", - "text": "Next", - "disabled": False, - "active": False, - } - ) - - return pagination - - -def project_list(head, sort="", per_page=20): - head = f"{head + ' ' if head != '' else ''}Projects" + sort - path = urlparse(request.path).path - - page = request.args.get("page", 1, type=int) - - if path in ("/projects/", "/projects"): - ibles = global_ibles["/projects"] - total = len(ibles) - else: - if "projects" in path.split("/"): - ibles = [] - - parts = path.split("/") - category = parts[1] - channel = "" if parts[2] == "projects" else parts[2] - - channel_names = unslugify(channel) - - for channel_name in channel_names: - project_ibles, total = projects_search( - category=category, - channel=channel_name, - per_page=per_page, - page=page, - ) - - if project_ibles: - break - - elif "search" in path.split("/"): - ibles = [] - query = ( - request.args.get("q") if request.method == "GET" else request.form["q"] - ) - - project_ibles, total = projects_search( - query=query, - filter_by="", - per_page=per_page, - page=page, - query_by="title,screenName", - ) - - else: - abort(404) - - for ible in project_ibles: - link = f"/{ible['document']['urlString']}" - img = proxy(ible["document"]["coverImageUrl"]) - - title = ible["document"]["title"] - author = ible["document"]["screenName"] - author_link = f"/member/{author}" - - channel = ible["document"]["primaryClassification"] - channel_link = f"/channel/{channel}" - - views = ible["document"]["views"] - favorites = ible["document"]["favorites"] - - ibles.append( - { - "link": link, - "img": img, - "title": title, - "author": author, - "author_link": author_link, - "channel": channel, - "channel_link": channel_link, - "views": views, - "favorites": favorites, - } - ) - - return render_template( - "projects.html", - title=unslugify(head)[0], - ibles=ibles, - path=path, - pagination=get_pagination(request, total, per_page), - ) - - -@app.route("/sitemap/") -@app.route("/sitemap/") -def route_sitemap(path=""): - try: - data = urlopen("https://www.instructables.com/sitemap/" + path) - except HTTPError as e: - abort(e.code) - - soup = BeautifulSoup(data.read().decode(), "html.parser") - - main = soup.select("div.sitemap-content")[0] - - group_section = main.select("div.group-section") - - if group_section: - groups = [] - for group in group_section: - category = group.select("h2 a")[0].text - category_link = group.select("h2 a")[0].get("href") - channels = [] - for li in group.select("ul.sitemap-listing li"): - channel = li.a.text - channel_link = li.a["href"] - channels.append([channel, channel_link]) - groups.append([category, category_link, channels]) - - else: - groups = [] - channels = [] - for li in main.select("ul.sitemap-listing li"): - channel = li.a.text - channel_link = li.a["href"] - - if channel_link.startswith("https://"): - channel_link = f'/{"/".join(channel_link.split("/")[3:])}' - - channels.append([channel, channel_link]) - groups.append(["", "", channels]) - - return render_template("sitemap.html", title="Sitemap", groups=groups) - - -@app.route("/contest/archive/") -def route_contest_archive(): - page = 1 - if request.args.get("page") is not None: - page = request.args.get("page") - - try: - data = urlopen(f"https://www.instructables.com/contest/archive/?page={page}") - except HTTPError as e: - abort(e.code) - - soup = BeautifulSoup(data.read().decode(), "html.parser") - - main = soup.select("div#contest-archive-wrapper")[0] - - contest_count = main.select("p.contest-count")[0].text - - contest_list = [] - for index, year in enumerate(main.select("div.contest-archive-list h2")): - year_list = main.select( - "div.contest-archive-list div.contest-archive-list-year" - )[index] - year_name = year.text - month_list = [] - for month in year_list.select("div.contest-archive-list-month"): - month_name = month.select("h3")[0].text - month_contest_list = [] - for p in month.select("p"): - date = p.select("span")[0].text - link = p.select("a")[0].get("href") - title = p.select("a")[0].text - month_contest_list.append([date, link, title]) - month_list.append([month_name, month_contest_list]) - contest_list.append([year_name, month_list]) - - pagination = main.select("nav.pagination ul.pagination")[0] - - return render_template( - "archives.html", - title=f"Contest Archives (Page {page})", - page=page, - contest_count=contest_count, - pagination=pagination, - contest_list=contest_list, - ) - - -@app.route("/contest//") -def route_contest(contest): - try: - data = urlopen(f"https://www.instructables.com/contest/{contest}/") - except HTTPError as e: - abort(e.code) - - soup = BeautifulSoup(data.read().decode(), "html.parser") - - title = soup.select('meta[property="og:title"]')[0].get("content") - - body = soup.select("div#contest-wrapper")[0] - - img = proxy(body.select("div#contest-masthead img")[0].get("src")) - - entry_count = body.select("li.entries-nav-btn")[0].text.split(" ")[0] - prizes = body.select("li.prizes-nav-btn")[0].text.split(" ")[0] - - info = body.select("div.contest-body-column-left")[0] - info.select("div#site-announcements-page")[0].decompose() - info.select("h3")[0].decompose() - info.select("div#contest-body-nav")[0].decompose() - info = str(info).replace("https://www.instructables.com", "/") - - body.select("span.contest-entity-count")[0].text - - entry_list = [] - for entry in body.select("div.contest-entries-list div.contest-entries-list-ible"): - link = entry.a["href"] - entry_img = proxy(entry.select("a noscript img")[0].get("src")) - entry_title = entry.select("a.ible-title")[0].text - author = entry.select("div span.ible-author a")[0].text - author_link = entry.select("div span.ible-author a")[0].get("href") - channel = entry.select("div span.ible-channel a")[0].text - channel_link = entry.select("div span.ible-channel a")[0].get("href") - views = entry.select(".ible-views")[0].text - - entry_list.append( - { - "link": link, - "entry_img": entry_img, - "entry_title": entry_title, - "author": author, - "author_link": author_link, - "channel": channel, - "channel_link": channel_link, - "views": views, - } - ) - - return render_template( - "contest.html", - title=title, - img=img, - entry_count=entry_count, - prizes=prizes, - info=info, - entry_list=entry_list, - ) - - -@app.route("/contest/") -def route_contests(): - try: - data = urlopen("https://www.instructables.com/contest/") - except HTTPError as e: - abort(e.code) - - soup = BeautifulSoup(data.read().decode(), "html.parser") - - contest_count = str(soup.select("p.contest-count")[0]) - - contests = [] - for contest in soup.select("div#cur-contests div.row-fluid div.contest-banner"): - link = contest.select("div.contest-banner-inner a")[0].get("href") - img = proxy(contest.select("div.contest-banner-inner a img")[0].get("src")) - alt = contest.select("div.contest-banner-inner a img")[0].get("alt") - deadline = contest.select("span.contest-meta-deadline")[0].get("data-deadline") - prizes = contest.select("span.contest-meta-count")[0].text - entries = contest.select("span.contest-meta-count")[1].text - - contests.append( - { - "link": link, - "img": img, - "alt": alt, - "deadline": deadline, - "prizes": prizes, - "entries": entries, - } - ) - - closed = [] - for display in soup.select("div.contest-winner-display"): - link = display.select("div.contest-banner-inner a")[0].get("href") - img = proxy(display.select("div.contest-banner-inner a img")[0].get("src")) - alt = display.select("div.contest-banner-inner a img")[0].get("alt") - featured_items = [] - for featured_item in display.select("ul.featured-items li"): - item_link = featured_item.select("div.ible-thumb a")[0].get("href") - item_img = proxy(featured_item.select("div.ible-thumb a img")[0].get("src")) - item_title = featured_item.select("a.title")[0].text - item_author = featured_item.select("a.author")[0].text - item_author_link = featured_item.select("a.author")[0].get("href") - - featured_items.append( - { - "link": item_link, - "img": item_img, - "title": item_title, - "author": item_author, - "author_link": item_author_link, - } - ) - closed.append( - {"link": link, "img": img, "alt": alt, "featured_items": featured_items} - ) - - return render_template( - "contests.html", - title="Contests", - contest_count=contest_count, - contests=contests, - closed=closed, - ) - - -@app.route("///projects/") -def route_channel_projects(category, channel): - return project_list(channel.title()) - - -@app.route("///projects//") -def route_channel_projects_sort(category, channel, sort): - return project_list( - channel.title(), - " Sorted by " + sort.title(), - ) - - -@app.route("//projects/") -def route_category_projects(category): - return project_list(category.title()) - - -@app.route("//projects//") -def route_category_projects_sort(category, sort): - return project_list(category.title(), " Sorted by " + sort.title()) - - -@app.route("/projects/") -def route_projects(): - return project_list("") - - -@app.route("/search", methods=["POST", "GET"]) -def route_search(): - return project_list("Search") - - -@app.route("/projects//") -def route_projects_sort(sort): - return project_list("", " Sorted by " + sort.title()) - - -@app.route("/circuits/") -def route_circuits(): - return category_page("Circuits") - - -@app.route("/workshop/") -def route_workshop(): - return category_page("Workshop") - - -@app.route("/craft/") -def route_craft(): - return category_page("Craft") - - -@app.route("/cooking/") -def route_cooking(): - return category_page("Cooking") - - -@app.route("/living/") -def route_living(): - return category_page("Living") - - -@app.route("/outside/") -def route_outside(): - return category_page("Outside") - - -@app.route("/teachers/") -def route_teachers(): - return category_page("Teachers", True) - - -@app.route("/member//instructables/") -def route_member_instructables(member): - try: - data = urlopen(f"https://www.instructables.com/member/{member}/instructables/") - except HTTPError as e: - abort(e.code) - - soup = BeautifulSoup(data.read().decode(), "html.parser") - - header = soup.select(".profile-header.profile-header-social")[0] - header_content = member_header(header) - - ibles = soup.select("ul.ible-list-items")[0] - ible_list = [] - for ible in ibles.select("li"): - link = ible.select("div.thumbnail-image")[0].a.get("href") - img = proxy(ible.select("div.thumbnail-image a noscript img")[0].get("src")) - title = ible.select("div.caption-inner a.title")[0].text - - stats = ible.select("div.ible-stats-right-col")[0] - views = 0 - if stats.select("span.ible-views") != []: - views = stats.select("span.ible-views")[0].text - favorites = 0 - if stats.select("span.ible-favorites") != []: - favorites = stats.select("span.ible-favorites")[0].text - - ible_list.append( - { - "link": link, - "img": img, - "title": title, - "views": views, - "favorites": favorites, - } - ) - - return render_template( - "member-instructables.html", - title=f"{header_content['title']}'s Instructables", - header_content=header_content, - ibles=ible_list, - ) - - -@app.route("/member//") -def route_member(member): - headers = { - "User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0" - } - - request = Request( - f"https://www.instructables.com/member/{member}/", headers=headers - ) - - try: - data = urlopen(request) - except HTTPError as e: - abort(e.code) - - soup = BeautifulSoup(data.read().decode(), "html.parser") - - header_content = member_header(soup) - - body = soup.select("div.member-profile-body")[0] - - ible_list = body.select(".boxed-content.promoted-content") - - ible_list_title = "" - ibles = [] - - if ible_list != []: - ible_list = ible_list[0] - ible_list_title = ible_list.select("h2.module-title")[0].text - for ible in ible_list.select("ul.promoted-items li"): - ible_title = ible.get("data-title") - ible_link = ible.select("div.image-wrapper")[0].a.get("href") - ible_img = proxy(ible.select("div.image-wrapper a img")[0].get("src")) - - ibles.append({"title": ible_title, "link": ible_link, "img": ible_img}) - - ach_list = body.select( - "div.two-col-section div.right-col-section.centered-sidebar div.boxed-content.about-me" - ) - - ach_list_title = "" - achs = [] - - if len(ach_list) > 1: - ach_list = ach_list[1] - ach_list_title = ach_list.select("h2.module-title")[0].text - for ach in ach_list.select( - "div.achievements-section.main-achievements.contest-achievements div.achievement-item:not(.two-column-filler)" - ): - ach_title = ach.select("div.achievement-info span.achievement-title")[ - 0 - ].text - ach_desc = ach.select("div.achievement-info span.achievement-description")[ - 0 - ].text - achs.append([ach_title, ach_desc]) - - return render_template( - "member.html", - title=header_content["title"] + "'s Profile", - header_content=header_content, - ible_list_title=ible_list_title, - ibles=ibles, - ach_list_title=ach_list_title, - achs=achs, - ) - - -@app.route("/
/") -def route_article(article): - try: - data = urlopen( - f"https://www.instructables.com/json-api/showInstructableModel?urlString={article}" - ) - data = json.loads(data.read().decode()) - except HTTPError as e: - abort(e.code) - - try: - title = data["title"] - author = data["author"]["screenName"] - author_link = f"/member/{author}" - category = data["classifications"][0]["title"] - category_slug = data["classifications"][0]["name"] - category_link = f"/{category_slug}/" - channel = data["classifications"][0]["channels"][0]["title"] - channel_slug = data["classifications"][0]["channels"][0]["name"] - channel_link = f"/{category_slug}/{channel_slug}/" - - views = data["views"] - favorites = data["favorites"] - - if "steps" in data: - steps = [] - - if "supplies" in data: - supplies = data["supplies"] - - supplies_files = [] - - if "suppliesFiles" in data: - supplies_files = data["suppliesFiles"] - - data["steps"].insert( - 1, {"title": "Supplies", "body": supplies, "files": supplies_files} - ) - - for step in data["steps"]: - step_title = step["title"] - print(step_title) - - step_imgs = [] - step_videos = [] # TODO: Check if this is still required - step_iframes = [] - step_downloads = [] - - for file in step["files"]: - print(file) - if file["image"] and "embedType" not in "file": - step_imgs.append( - {"src": proxy(file["downloadUrl"]), "alt": file["name"]} - ) - - elif not file["image"]: - step_downloads.append( - {"src": proxy(file["downloadUrl"]), "name": file["name"]} - ) - - else: # Leaves us with embeds - embed_code = file["embedHtmlCode"] - soup = BeautifulSoup(embed_code, "html.parser") - - iframe = soup.select("iframe")[0] - - src = iframe.get("src") - - if src.startswith("https://content.instructables.com"): - src = src.replace( - "https://content.instructables.com", - f"/proxy/?url={src}", - ) - - elif invidious and src.startswith("https://www.youtube.com"): - src = src.replace("https://www.youtube.com", invidious) - - elif not unsafe: - src = "/iframe/?url=" + quote(src) - - step_iframes.append( - { - "src": src, - "width": file.get("width"), - "height": file.get("height"), - } - ) - - step_text = step["body"] - step_text = step_text.replace( - "https://content.instructables.com", - "/proxy/?url=https://content.instructables.com", - ) - steps.append( - { - "title": step_title, - "imgs": step_imgs, - "text": step_text, - "videos": step_videos, - "iframes": step_iframes, - "downloads": step_downloads, - } - ) - - comments_list = [] - comment_count = 0 - - # TODO: Fix comments - - # comments = body.select("section.discussion")[0] - - # comment_count = comments.select("h2")[0].text - # comment_list = comments.select("div.posts") - - # if comment_list != []: - # comment_list = comment_list[0] - # comments_list = [] - # replies_used = 0 - # for comment in comment_list.select(".post.js-comment:not(.reply)"): - # comment_votes = comment.select(".votes")[0].text - # comment_author_img_src = proxy(comment.select(".avatar a noscript img")[0].get("src")) - # comment_author_img_alt = comment.select(".avatar a noscript img")[0].get("alt") - # comment_author = comment.select(".posted-by a")[0].text - # comment_author_link = comment.select(".posted-by a")[0].get("href") - # comment_date = comment.select(".posted-by p.posted-date")[0].text - # comment_text = comment.select("div.text p")[0] - # comment_reply_count = comment.select("button.js-show-replies") - # if comment_reply_count != []: - # comment_reply_count = comment_reply_count[0].get("data-num-hidden") - # else: - # comment_reply_count = 0 - # reply_list = [] - # for index, reply in enumerate(comment_list.select(".post.js-comment:not(.reply) ~ .post.js-comment.reply.hide:has(~.post.js-comment:not(.reply))")[replies_used:int(comment_reply_count) + replies_used]): - # reply_votes = reply.select(".votes")[0].text - # reply_author_img_src = proxy(reply.select(".avatar a noscript img")[0].get("src")) - # reply_author_img_alt = reply.select(".avatar a noscript img")[0].get("alt") - # reply_author = reply.select(".posted-by a")[0].text - # reply_author_link = reply.select(".posted-by a")[0].get("href") - # reply_date = reply.select(".posted-by p.posted-date")[0].text - # reply_text = reply.select("div.text p")[0] - - # reply_list.append([reply_votes, reply_author_img_src, reply_author_img_alt, reply_author, reply_author_link, reply_date, reply_text]) - # replies_used += 1 - - # comments_list.append([comment_votes, comment_author_img_src, comment_author_img_alt, comment_author, comment_author_link, comment_date, comment_text, comment_reply_count, reply_list]) - return render_template( - "article.html", - title=title, - author=author, - author_link=author_link, - category=category, - category_link=category_link, - channel=channel, - channel_link=channel_link, - views=views, - favorites=favorites, - steps=steps, - comment_count=comment_count, - comments_list=comments_list, - enumerate=enumerate, - ) - else: - ## Collections - thumbnails = [] - for thumbnail in data["instructables"]: - text = thumbnail["title"] - link = thumbnail["showUrl"] - img = proxy(thumbnail["downloadUrl"]) - thumbnail_title = thumbnail["title"] - thumbnail_author = thumbnail["author"]["screenName"] - thumbnail_author_link = f"/member/{thumbnail_author}" - thumbnail_channel = thumbnail["classifications"][0]["channels"][0][ - "title" - ] - thumbnail_category = thumbnail["classifications"][0]["title"] - thumbnail_channel_link = f"/{thumbnail_category}/{thumbnail_channel}" - - thumbnails.append( - { - "text": text, - "link": link, - "img": img, - "title": thumbnail_title, - "author": thumbnail_author, - "author_link": thumbnail_author_link, - "channel": thumbnail_channel, - "channel_link": thumbnail_channel_link, - } - ) - - print(thumbnails[-1]) - - return render_template( - "collection.html", - title=title, - author=author, - author_link=author_link, - category=category, - category_link=category_link, - channel=channel, - channel_link=channel_link, - views=views, - favorites=favorites, - thumbnails=thumbnails, - ) - - except Exception: - print_exc() - raise InternalServerError() - - -@app.route("///") -def route_channel_redirect(category, channel): - # TODO: Just check if the channel exists - if ( - category == "circuits" - or category == "workshop" - or category == "craft" - or category == "cooking" - or category == "living" - or category == "outside" - or category == "teachers" - ): - return redirect(f"/{category}/{channel}/projects/", 307) - else: - raise NotFound() - - -@app.route("/") -def route_explore(): - try: - data = urlopen("https://www.instructables.com/") - except HTTPError as e: - abort(e.code) - - soup = BeautifulSoup(data.read().decode(), "html.parser") - - explore = soup.select(".home-content-explore-wrap")[0] - - title = explore.select("h2")[0].text - - circuits = explore_lists( - explore.select(".home-content-explore-category-circuits")[0] - ) - workshop = explore_lists( - explore.select(".home-content-explore-category-workshop")[0] - ) - craft = explore_lists(explore.select(".home-content-explore-category-craft")[0]) - cooking = explore_lists(explore.select(".home-content-explore-category-cooking")[0]) - living = explore_lists(explore.select(".home-content-explore-category-living")[0]) - outside = explore_lists(explore.select(".home-content-explore-category-outside")[0]) - teachers = explore_lists( - explore.select(".home-content-explore-category-teachers")[0] - ) - - return render_template( - "index.html", - title=title, - sections=[ - ("Circuits", "/circuits", circuits), - ("Workshop", "/workshop", workshop), - ("Craft", "/craft", craft), - ("Cooking", "/cooking", cooking), - ("Living", "/living", living), - ("Outside", "/outside", outside), - ("Teachers", "/teachers", teachers), - ], - ) - - -@app.route("/proxy/") -def route_proxy(): - url = request.args.get("url") - if url is not None: - if url.startswith("https://cdn.instructables.com/") or url.startswith( - "https://content.instructables.com/" - ): - - def generate(): - # Subfunction to allow streaming the data instead of - # downloading all of it at once - try: - with urlopen(unquote(url)) as data: - while True: - chunk = data.read(1024 * 1024) - if not chunk: - break - yield chunk - except HTTPError as e: - abort(e.code) - - try: - with urlopen(unquote(url)) as data: - content_type = data.headers["content-type"] - except HTTPError as e: - abort(e.code) - except KeyError: - raise InternalServerError() - - return Response(generate(), content_type=content_type) - else: - raise BadRequest() - else: - raise BadRequest() - - -@app.route("/iframe/") -def route_iframe(): - url = request.args.get("url") - url = unquote(url) - if url is not None: - return render_template("iframe.html", url=url) - else: - raise BadRequest() - - -@app.route("/privacypolicy/") -def privacypolicy(): - content = "No privacy policy found." - - try: - with (pathlib.Path(__file__).parent / "privacy.txt").open() as f: - content = f.read() - except OSError: - pass - - return render_template( - "privacypolicy.html", title="Privacy Policy", content=content - ) - - -@app.errorhandler(404) -def not_found(e): - return render_template("404.html"), 404 - - -@app.errorhandler(400) -def bad_request(e): - return render_template("400.html"), 400 - - -@app.errorhandler(429) -def too_many_requests(e): - return render_template("429.html"), 429 - - -@app.errorhandler(500) -def internal_server_error(e): - return render_template("500.html"), 500 - - -def main(): - global debugmode, invidious, unsafe - - parser = ArgumentParser() - parser.add_argument( - "-p", - "--port", - default=8002, - type=int, - help="Port to listen on", - ) - parser.add_argument( - "-d", - "--debug", - action="store_true", - help="Enable debug mode", - ) - parser.add_argument( - "-l", - "--listen-host", - default="127.0.0.1", - help="Host to listen on", - ) - parser.add_argument( - "-I", - "--invidious", - help="URL to Invidious instance, e.g. https://invidious.private.coffee/", - ) - parser.add_argument( - "-u", - "--unsafe", - action="store_true", - help="Display iframes regardless of origin", - ) - parser.add_argument( - "-P", - "--privacy-file", - default="privacy.txt", - help="File to read privacy policy from", - ) - args = parser.parse_args() - - debugmode = os.environ.get( - "FLASK_DEBUG", os.environ.get("STRUCTABLES_DEBUG", False) - ) - invidious = os.environ.get("STRUCTABLES_INVIDIOUS") - unsafe = os.environ.get("STRUCTABLES_UNSAFE", False) - - if args.debug: - debugmode = True - - if args.invidious: - invidious = args.invidious - - if args.unsafe: - unsafe = True - - if debugmode: - app.logger.setLevel(logging.DEBUG) - - app.run(port=args.port, host=args.listen_host, debug=debugmode) - +init_routes(app) if __name__ == "__main__": - main() + app.run(port=app.config['PORT'], host=app.config['LISTEN_HOST'], debug=app.config['DEBUG']) # Initialize data when the server starts -update_data() +update_data(app) \ No newline at end of file diff --git a/src/structables/routes/__init__.py b/src/structables/routes/__init__.py new file mode 100644 index 0000000..011338e --- /dev/null +++ b/src/structables/routes/__init__.py @@ -0,0 +1,12 @@ +from .main import init_main_routes +from .category import init_category_routes +from .member import init_member_routes +from .proxy import init_proxy_routes +from .contest import init_contest_routes + +def init_routes(app): + init_main_routes(app) + init_category_routes(app) + init_member_routes(app) + init_proxy_routes(app) + init_contest_routes(app) \ No newline at end of file diff --git a/src/structables/routes/category.py b/src/structables/routes/category.py new file mode 100644 index 0000000..eb3c518 --- /dev/null +++ b/src/structables/routes/category.py @@ -0,0 +1,76 @@ +from flask import redirect +from werkzeug.exceptions import NotFound +from ..utils.helpers import project_list, category_page + + +def init_category_routes(app): + @app.route("///projects/") + def route_channel_projects(category, channel): + return project_list(app, channel.title()) + + @app.route("///projects//") + def route_channel_projects_sort(category, channel, sort): + return project_list( + app, + channel.title(), + " Sorted by " + sort.title(), + ) + + @app.route("//projects/") + def route_category_projects(category): + return project_list(app, category.title()) + + @app.route("//projects//") + def route_category_projects_sort(category, sort): + return project_list(app, category.title(), " Sorted by " + sort.title()) + + @app.route("/projects/") + def route_projects(): + return project_list(app, "") + + @app.route("/projects//") + def route_projects_sort(sort): + return project_list(app, "", " Sorted by " + sort.title()) + + @app.route("/circuits/") + def route_circuits(): + return category_page(app, "Circuits") + + @app.route("/workshop/") + def route_workshop(): + return category_page(app, "Workshop") + + @app.route("/craft/") + def route_craft(): + return category_page(app, "Craft") + + @app.route("/cooking/") + def route_cooking(): + return category_page(app, "Cooking") + + @app.route("/living/") + def route_living(): + return category_page(app, "Living") + + @app.route("/outside/") + def route_outside(): + return category_page(app, "Outside") + + @app.route("/teachers/") + def route_teachers(): + return category_page(app, "Teachers", True) + + @app.route("///") + def route_channel_redirect(category, channel): + if ( + category == "circuits" + or category == "workshop" + or category == "craft" + or category == "cooking" + or category == "living" + or category == "outside" + or category == "teachers" + ): + return redirect(f"/{category}/{channel}/projects/", 307) + else: + raise NotFound() diff --git a/src/structables/routes/contest.py b/src/structables/routes/contest.py new file mode 100644 index 0000000..48fef2f --- /dev/null +++ b/src/structables/routes/contest.py @@ -0,0 +1,177 @@ +from flask import render_template, request, abort +from urllib.request import urlopen +from urllib.error import HTTPError +from ..utils.helpers import proxy +from bs4 import BeautifulSoup + +def init_contest_routes(app): + @app.route("/contest/archive/") + def route_contest_archive(): + page = 1 + if request.args.get("page") is not None: + page = request.args.get("page") + + try: + data = urlopen(f"https://www.instructables.com/contest/archive/?page={page}") + except HTTPError as e: + abort(e.code) + + soup = BeautifulSoup(data.read().decode(), "html.parser") + + main = soup.select("div#contest-archive-wrapper")[0] + + contest_count = main.select("p.contest-count")[0].text + + contest_list = [] + for index, year in enumerate(main.select("div.contest-archive-list h2")): + year_list = main.select( + "div.contest-archive-list div.contest-archive-list-year" + )[index] + year_name = year.text + month_list = [] + for month in year_list.select("div.contest-archive-list-month"): + month_name = month.select("h3")[0].text + month_contest_list = [] + for p in month.select("p"): + date = p.select("span")[0].text + link = p.select("a")[0].get("href") + title = p.select("a")[0].text + month_contest_list.append([date, link, title]) + month_list.append([month_name, month_contest_list]) + contest_list.append([year_name, month_list]) + + pagination = main.select("nav.pagination ul.pagination")[0] + + return render_template( + "archives.html", + title=f"Contest Archives (Page {page})", + page=page, + contest_count=contest_count, + pagination=pagination, + contest_list=contest_list, + ) + + @app.route("/contest//") + def route_contest(contest): + try: + data = urlopen(f"https://www.instructables.com/contest/{contest}/") + except HTTPError as e: + abort(e.code) + + soup = BeautifulSoup(data.read().decode(), "html.parser") + + title = soup.select('meta[property="og:title"]')[0].get("content") + + body = soup.select("div#contest-wrapper")[0] + + img = proxy(body.select("div#contest-masthead img")[0].get("src")) + + entry_count = body.select("li.entries-nav-btn")[0].text.split(" ")[0] + prizes = body.select("li.prizes-nav-btn")[0].text.split(" ")[0] + + info = body.select("div.contest-body-column-left")[0] + info.select("div#site-announcements-page")[0].decompose() + info.select("h3")[0].decompose() + info.select("div#contest-body-nav")[0].decompose() + info = str(info).replace("https://www.instructables.com", "/") + + body.select("span.contest-entity-count")[0].text + + entry_list = [] + for entry in body.select("div.contest-entries-list div.contest-entries-list-ible"): + link = entry.a["href"] + entry_img = proxy(entry.select("a noscript img")[0].get("src")) + entry_title = entry.select("a.ible-title")[0].text + author = entry.select("div span.ible-author a")[0].text + author_link = entry.select("div span.ible-author a")[0].get("href") + channel = entry.select("div span.ible-channel a")[0].text + channel_link = entry.select("div span.ible-channel a")[0].get("href") + views = entry.select(".ible-views")[0].text + + entry_list.append( + { + "link": link, + "entry_img": entry_img, + "entry_title": entry_title, + "author": author, + "author_link": author_link, + "channel": channel, + "channel_link": channel_link, + "views": views, + } + ) + + return render_template( + "contest.html", + title=title, + img=img, + entry_count=entry_count, + prizes=prizes, + info=info, + entry_list=entry_list, + ) + + @app.route("/contest/") + def route_contests(): + try: + data = urlopen("https://www.instructables.com/contest/") + except HTTPError as e: + abort(e.code) + + soup = BeautifulSoup(data.read().decode(), "html.parser") + + contest_count = str(soup.select("p.contest-count")[0]) + + contests = [] + for contest in soup.select("div#cur-contests div.row-fluid div.contest-banner"): + link = contest.select("div.contest-banner-inner a")[0].get("href") + img = proxy(contest.select("div.contest-banner-inner a img")[0].get("src")) + alt = contest.select("div.contest-banner-inner a img")[0].get("alt") + deadline = contest.select("span.contest-meta-deadline")[0].get("data-deadline") + prizes = contest.select("span.contest-meta-count")[0].text + entries = contest.select("span.contest-meta-count")[1].text + + contests.append( + { + "link": link, + "img": img, + "alt": alt, + "deadline": deadline, + "prizes": prizes, + "entries": entries, + } + ) + + closed = [] + for display in soup.select("div.contest-winner-display"): + link = display.select("div.contest-banner-inner a")[0].get("href") + img = proxy(display.select("div.contest-banner-inner a img")[0].get("src")) + alt = display.select("div.contest-banner-inner a img")[0].get("alt") + featured_items = [] + for featured_item in display.select("ul.featured-items li"): + item_link = featured_item.select("div.ible-thumb a")[0].get("href") + item_img = proxy(featured_item.select("div.ible-thumb a img")[0].get("src")) + item_title = featured_item.select("a.title")[0].text + item_author = featured_item.select("a.author")[0].text + item_author_link = featured_item.select("a.author")[0].get("href") + + featured_items.append( + { + "link": item_link, + "img": item_img, + "title": item_title, + "author": item_author, + "author_link": item_author_link, + } + ) + closed.append( + {"link": link, "img": img, "alt": alt, "featured_items": featured_items} + ) + + return render_template( + "contests.html", + title="Contests", + contest_count=contest_count, + contests=contests, + closed=closed, + ) \ No newline at end of file diff --git a/src/structables/routes/main.py b/src/structables/routes/main.py new file mode 100644 index 0000000..1160062 --- /dev/null +++ b/src/structables/routes/main.py @@ -0,0 +1,362 @@ +from flask import render_template, abort +from urllib.request import urlopen +from urllib.error import HTTPError +from bs4 import BeautifulSoup +from urllib.parse import quote +from werkzeug.exceptions import InternalServerError +from traceback import print_exc +import pathlib +import json + +from ..utils.data import update_data +from ..utils.helpers import explore_lists, proxy +from .category import project_list + + +def init_main_routes(app): + @app.route("/") + def route_explore(): + try: + data = urlopen("https://www.instructables.com/") + except HTTPError as e: + abort(e.code) + + soup = BeautifulSoup(data.read().decode(), "html.parser") + + explore = soup.select(".home-content-explore-wrap")[0] + + title = explore.select("h2")[0].text + + circuits = explore_lists( + explore.select(".home-content-explore-category-circuits")[0] + ) + workshop = explore_lists( + explore.select(".home-content-explore-category-workshop")[0] + ) + craft = explore_lists(explore.select(".home-content-explore-category-craft")[0]) + cooking = explore_lists( + explore.select(".home-content-explore-category-cooking")[0] + ) + living = explore_lists( + explore.select(".home-content-explore-category-living")[0] + ) + outside = explore_lists( + explore.select(".home-content-explore-category-outside")[0] + ) + teachers = explore_lists( + explore.select(".home-content-explore-category-teachers")[0] + ) + + return render_template( + "index.html", + title=title, + sections=[ + ("Circuits", "/circuits", circuits), + ("Workshop", "/workshop", workshop), + ("Craft", "/craft", craft), + ("Cooking", "/cooking", cooking), + ("Living", "/living", living), + ("Outside", "/outside", outside), + ("Teachers", "/teachers", teachers), + ], + ) + + @app.route("/sitemap/") + @app.route("/sitemap/") + def route_sitemap(path=""): + try: + data = urlopen("https://www.instructables.com/sitemap/" + path) + except HTTPError as e: + abort(e.code) + + soup = BeautifulSoup(data.read().decode(), "html.parser") + + main = soup.select("div.sitemap-content")[0] + + group_section = main.select("div.group-section") + + if group_section: + groups = [] + for group in group_section: + category = group.select("h2 a")[0].text + category_link = group.select("h2 a")[0].get("href") + channels = [] + for li in group.select("ul.sitemap-listing li"): + channel = li.a.text + channel_link = li.a["href"] + channels.append([channel, channel_link]) + groups.append([category, category_link, channels]) + + else: + groups = [] + channels = [] + for li in main.select("ul.sitemap-listing li"): + channel = li.a.text + channel_link = li.a["href"] + + if channel_link.startswith("https://"): + channel_link = f'/{"/".join(channel_link.split("/")[3:])}' + + channels.append([channel, channel_link]) + groups.append(["", "", channels]) + + return render_template("sitemap.html", title="Sitemap", groups=groups) + + @app.route("/
/") + def route_article(article): + try: + data = urlopen( + f"https://www.instructables.com/json-api/showInstructableModel?urlString={article}" + ) + data = json.loads(data.read().decode()) + except HTTPError as e: + abort(e.code) + + try: + title = data["title"] + author = data["author"]["screenName"] + author_link = f"/member/{author}" + category = data["classifications"][0]["title"] + category_slug = data["classifications"][0]["name"] + category_link = f"/{category_slug}/" + channel = data["classifications"][0]["channels"][0]["title"] + channel_slug = data["classifications"][0]["channels"][0]["name"] + channel_link = f"/{category_slug}/{channel_slug}/" + + views = data["views"] + favorites = data["favorites"] + + if "steps" in data: + steps = [] + + if "supplies" in data: + supplies = data["supplies"] + + supplies_files = [] + + if "suppliesFiles" in data: + supplies_files = data["suppliesFiles"] + + data["steps"].insert( + 1, + { + "title": "Supplies", + "body": supplies, + "files": supplies_files, + }, + ) + + for step in data["steps"]: + step_title = step["title"] + + step_imgs = [] + step_videos = [] # TODO: Check if this is still required + step_iframes = [] + step_downloads = [] + + for file in step["files"]: + if file["image"] and "embedType" not in "file": + step_imgs.append( + {"src": proxy(file["downloadUrl"]), "alt": file["name"]} + ) + + elif not file["image"]: + step_downloads.append( + { + "src": proxy(file["downloadUrl"]), + "name": file["name"], + } + ) + + else: # Leaves us with embeds + embed_code = file["embedHtmlCode"] + soup = BeautifulSoup(embed_code, "html.parser") + + iframe = soup.select("iframe")[0] + + src = iframe.get("src") + + if src.startswith("https://content.instructables.com"): + src = src.replace( + "https://content.instructables.com", + f"/proxy/?url={src}", + ) + + elif app.config["INVIDIOUS"] and src.startswith( + "https://www.youtube.com" + ): + src = src.replace( + "https://www.youtube.com", app.config["INVIDIOUS"] + ) + + elif not app.config["UNSAFE"]: + src = "/iframe/?url=" + quote(src) + + step_iframes.append( + { + "src": src, + "width": file.get("width"), + "height": file.get("height"), + } + ) + + step_text = step["body"] + step_text = step_text.replace( + "https://content.instructables.com", + "/proxy/?url=https://content.instructables.com", + ) + steps.append( + { + "title": step_title, + "imgs": step_imgs, + "text": step_text, + "videos": step_videos, + "iframes": step_iframes, + "downloads": step_downloads, + } + ) + + comments_list = [] + comment_count = 0 + + # TODO: Fix comments + + # comments = body.select("section.discussion")[0] + + # comment_count = comments.select("h2")[0].text + # comment_list = comments.select("div.posts") + + # if comment_list != []: + # comment_list = comment_list[0] + # comments_list = [] + # replies_used = 0 + # for comment in comment_list.select(".post.js-comment:not(.reply)"): + # comment_votes = comment.select(".votes")[0].text + # comment_author_img_src = proxy(comment.select(".avatar a noscript img")[0].get("src")) + # comment_author_img_alt = comment.select(".avatar a noscript img")[0].get("alt") + # comment_author = comment.select(".posted-by a")[0].text + # comment_author_link = comment.select(".posted-by a")[0].get("href") + # comment_date = comment.select(".posted-by p.posted-date")[0].text + # comment_text = comment.select("div.text p")[0] + # comment_reply_count = comment.select("button.js-show-replies") + # if comment_reply_count != []: + # comment_reply_count = comment_reply_count[0].get("data-num-hidden") + # else: + # comment_reply_count = 0 + # reply_list = [] + # for index, reply in enumerate(comment_list.select(".post.js-comment:not(.reply) ~ .post.js-comment.reply.hide:has(~.post.js-comment:not(.reply))")[replies_used:int(comment_reply_count) + replies_used]): + # reply_votes = reply.select(".votes")[0].text + # reply_author_img_src = proxy(reply.select(".avatar a noscript img")[0].get("src")) + # reply_author_img_alt = reply.select(".avatar a noscript img")[0].get("alt") + # reply_author = reply.select(".posted-by a")[0].text + # reply_author_link = reply.select(".posted-by a")[0].get("href") + # reply_date = reply.select(".posted-by p.posted-date")[0].text + # reply_text = reply.select("div.text p")[0] + + # reply_list.append([reply_votes, reply_author_img_src, reply_author_img_alt, reply_author, reply_author_link, reply_date, reply_text]) + # replies_used += 1 + + # comments_list.append([comment_votes, comment_author_img_src, comment_author_img_alt, comment_author, comment_author_link, comment_date, comment_text, comment_reply_count, reply_list]) + return render_template( + "article.html", + title=title, + author=author, + author_link=author_link, + category=category, + category_link=category_link, + channel=channel, + channel_link=channel_link, + views=views, + favorites=favorites, + steps=steps, + comment_count=comment_count, + comments_list=comments_list, + enumerate=enumerate, + ) + else: + ## Collections + thumbnails = [] + for thumbnail in data["instructables"]: + text = thumbnail["title"] + link = thumbnail["showUrl"] + img = proxy(thumbnail["downloadUrl"]) + thumbnail_title = thumbnail["title"] + thumbnail_author = thumbnail["author"]["screenName"] + thumbnail_author_link = f"/member/{thumbnail_author}" + thumbnail_channel = thumbnail["classifications"][0]["channels"][0][ + "title" + ] + thumbnail_category = thumbnail["classifications"][0]["title"] + thumbnail_channel_link = ( + f"/{thumbnail_category}/{thumbnail_channel}" + ) + + thumbnails.append( + { + "text": text, + "link": link, + "img": img, + "title": thumbnail_title, + "author": thumbnail_author, + "author_link": thumbnail_author_link, + "channel": thumbnail_channel, + "channel_link": thumbnail_channel_link, + } + ) + + return render_template( + "collection.html", + title=title, + author=author, + author_link=author_link, + category=category, + category_link=category_link, + channel=channel, + channel_link=channel_link, + views=views, + favorites=favorites, + thumbnails=thumbnails, + ) + + except Exception: + print_exc() + raise InternalServerError() + + @app.route("/search", methods=["POST", "GET"]) + def route_search(): + return project_list(app, "Search") + + @app.route("/cron/") + def cron(): + update_data(app) + return "OK" + + @app.route("/privacypolicy/") + def privacypolicy(): + content = "No privacy policy found." + + try: + with (pathlib.Path(__file__).parent / "privacy.txt").open() as f: + content = f.read() + except OSError: + pass + + return render_template( + "privacypolicy.html", title="Privacy Policy", content=content + ) + + @app.errorhandler(404) + def not_found(e): + return render_template("404.html"), 404 + + @app.errorhandler(400) + def bad_request(e): + return render_template("400.html"), 400 + + @app.errorhandler(429) + def too_many_requests(e): + return render_template("429.html"), 429 + + @app.errorhandler(500) + def internal_server_error(e): + return render_template("500.html"), 500 diff --git a/src/structables/routes/member.py b/src/structables/routes/member.py new file mode 100644 index 0000000..d425782 --- /dev/null +++ b/src/structables/routes/member.py @@ -0,0 +1,124 @@ +from flask import render_template, abort +from urllib.request import urlopen +from urllib.error import HTTPError +from ..utils.helpers import proxy, member_header +from bs4 import BeautifulSoup +from urllib.request import Request + + +def init_member_routes(app): + @app.route("/member//instructables/") + def route_member_instructables(member): + try: + data = urlopen( + f"https://www.instructables.com/member/{member}/instructables/" + ) + except HTTPError as e: + abort(e.code) + + soup = BeautifulSoup(data.read().decode(), "html.parser") + + header = soup.select(".profile-header.profile-header-social")[0] + header_content = member_header(header) + + ibles = soup.select("ul.ible-list-items")[0] + ible_list = [] + for ible in ibles.select("li"): + link = ible.select("div.thumbnail-image")[0].a.get("href") + img = proxy(ible.select("div.thumbnail-image a noscript img")[0].get("src")) + title = ible.select("div.caption-inner a.title")[0].text + + stats = ible.select("div.ible-stats-right-col")[0] + views = 0 + if stats.select("span.ible-views") != []: + views = stats.select("span.ible-views")[0].text + favorites = 0 + if stats.select("span.ible-favorites") != []: + favorites = stats.select("span.ible-favorites")[0].text + + ible_list.append( + { + "link": link, + "img": img, + "title": title, + "views": views, + "favorites": favorites, + } + ) + + return render_template( + "member-instructables.html", + title=f"{header_content['title']}'s Instructables", + header_content=header_content, + ibles=ible_list, + ) + + @app.route("/member//") + def route_member(member): + headers = { + "User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0" + } + + request = Request( + f"https://www.instructables.com/member/{member}/", headers=headers + ) + + try: + data = urlopen(request) + except HTTPError as e: + abort(e.code) + + soup = BeautifulSoup(data.read().decode(), "html.parser") + + header_content = member_header(soup) + + body = soup.select("div.member-profile-body")[0] + + ible_list = body.select(".boxed-content.promoted-content") + + ible_list_title = "" + ibles = [] + + if ible_list != []: + ible_list = ible_list[0] + ible_list_title = ible_list.select("h2.module-title")[0].text + for ible in ible_list.select("ul.promoted-items li"): + ible_title = ible.get("data-title") + ible_link = ible.select("div.image-wrapper")[0].a.get("href") + ible_img = proxy(ible.select("div.image-wrapper a img")[0].get("src")) + + ibles.append({"title": ible_title, "link": ible_link, "img": ible_img}) + + ach_list = body.select( + "div.two-col-section div.right-col-section.centered-sidebar div.boxed-content.about-me" + ) + + ach_list_title = "" + achs = [] + + if len(ach_list) > 1: + ach_list = ach_list[1] + ach_list_title = ach_list.select("h2.module-title")[0].text + for ach in ach_list.select( + "div.achievements-section.main-achievements.contest-achievements div.achievement-item:not(.two-column-filler)" + ): + try: + ach_title = ach.select("div.achievement-info span.achievement-title")[ + 0 + ].text + ach_desc = ach.select( + "div.achievement-info span.achievement-description" + )[0].text + achs.append([ach_title, ach_desc]) + except IndexError: + pass + + return render_template( + "member.html", + title=header_content["title"] + "'s Profile", + header_content=header_content, + ible_list_title=ible_list_title, + ibles=ibles, + ach_list_title=ach_list_title, + achs=achs, + ) diff --git a/src/structables/routes/proxy.py b/src/structables/routes/proxy.py new file mode 100644 index 0000000..5471573 --- /dev/null +++ b/src/structables/routes/proxy.py @@ -0,0 +1,50 @@ +from flask import render_template, request, Response, abort +from werkzeug.exceptions import BadRequest, InternalServerError +from urllib.parse import unquote +from urllib.error import HTTPError +from urllib.request import urlopen + +def init_proxy_routes(app): + @app.route("/proxy/") + def route_proxy(): + url = request.args.get("url") + if url is not None: + if url.startswith("https://cdn.instructables.com/") or url.startswith( + "https://content.instructables.com/" + ): + + def generate(): + # Subfunction to allow streaming the data instead of + # downloading all of it at once + try: + with urlopen(unquote(url)) as data: + while True: + chunk = data.read(1024 * 1024) + if not chunk: + break + yield chunk + except HTTPError as e: + abort(e.code) + + try: + with urlopen(unquote(url)) as data: + content_type = data.headers["content-type"] + except HTTPError as e: + abort(e.code) + except KeyError: + raise InternalServerError() + + return Response(generate(), content_type=content_type) + else: + raise BadRequest() + else: + raise BadRequest() + + @app.route("/iframe/") + def route_iframe(): + url = request.args.get("url") + url = unquote(url) + if url is not None: + return render_template("iframe.html", url=url) + else: + raise BadRequest() \ No newline at end of file diff --git a/src/structables/utils/data.py b/src/structables/utils/data.py new file mode 100644 index 0000000..f713370 --- /dev/null +++ b/src/structables/utils/data.py @@ -0,0 +1,56 @@ +from urllib.request import urlopen +import logging +from bs4 import BeautifulSoup +from .helpers import proxy, projects_search + +logging.basicConfig(level=logging.DEBUG) + + +def update_data(app): + logging.debug("Updating data...") + + channels = [] + + try: + app.global_ibles + except AttributeError: + app.global_ibles = {} + + sitemap_data = urlopen("https://www.instructables.com/sitemap/") + sitemap_soup = BeautifulSoup(sitemap_data.read().decode(), "html.parser") + main = sitemap_soup.select("div.sitemap-content")[0] + + for group in main.select("div.group-section"): + channels.append(group.select("h2 a")[0].text.lower()) + + app.global_ibles["/projects"] = [] + project_ibles, total = projects_search(app, filter_by="featureFlag:=true") + + while len(app.global_ibles["/projects"]) <= 0: + for ible in project_ibles: + link = f"/{ible['document']['urlString']}" + img = proxy(ible["document"]["coverImageUrl"]) + + title = ible["document"]["title"] + author = ible["document"]["screenName"] + author_link = f"/member/{author}" + + channel = ible["document"]["primaryClassification"] + channel_link = f"/channel/{channel}" + + views = ible["document"]["views"] + favorites = ible["document"]["favorites"] + + app.global_ibles["/projects"].append( + { + "link": link, + "img": img, + "title": title, + "author": author, + "author_link": author_link, + "channel": channel, + "channel_link": channel_link, + "views": views, + "favorites": favorites, + } + ) diff --git a/src/structables/utils/helpers.py b/src/structables/utils/helpers.py new file mode 100644 index 0000000..1ea04be --- /dev/null +++ b/src/structables/utils/helpers.py @@ -0,0 +1,451 @@ +from urllib.parse import urlencode, urlparse, quote +from urllib.request import urlopen, Request +from bs4 import BeautifulSoup +import re +import logging +import json +from flask import request, render_template, abort + +logging.basicConfig(level=logging.DEBUG) + + +def proxy(url): + logging.debug(f"Generating proxy URL for {url}") + return f"/proxy/?url={url}" + + +def get_typesense_api_key(): + logging.debug("Getting Typesense API key...") + + data = urlopen("https://www.instructables.com/") + soup = BeautifulSoup(data.read().decode(), "html.parser") + scripts = soup.select("script") + + for script in scripts: + if "typesense" in script.text and ( + matches := re.search(r'"typesenseApiKey":\s?"(.*?)"', script.text) + ): + api_key = matches.group(1) + logging.debug(f"Identified Typesense API key as {api_key}") + return api_key + + logging.error("Failed to get Typesense API key") + + +def unslugify(slug): + """Return a list of possible original titles for a slug. + + Args: + slug (str): The slug to unslugify. + + Returns: + List[str]: A list of possible original titles for the slug. + """ + results = [] + + results.append(slug.replace("-", " ").title()) + + if "and" in slug: + results.append(results[0].replace("And", "&").title()) + + return results + + +def get_pagination(request, total, per_page=1): + pagination = [] + + args = request.args.copy() + current = int(args.pop("page", 1)) + + query_string = urlencode(args) + + total_pages = int(total / per_page) + + if query_string: + query_string = "&" + query_string + + if current > 1: + pagination.append( + { + "link": f"?page={current - 1}{query_string}", + "text": "Previous", + "disabled": False, + "active": False, + } + ) + + for page in range(max(current - 5, 1), min(current + 5, total_pages)): + if page == current: + pagination.append( + { + "link": f"?page={page}{query_string}", + "text": page, + "disabled": False, + "active": True, + } + ) + else: + pagination.append( + { + "link": f"?page={page}{query_string}", + "text": page, + "disabled": False, + "active": False, + } + ) + + if current < total_pages: + pagination.append( + { + "link": f"?page={current + 1}{query_string}", + "text": "Next", + "disabled": False, + "active": False, + } + ) + + return pagination + + +def member_header(header): + avatar = proxy( + header.select("div.profile-avatar-container img.profile-avatar")[0].get("src") + ) + title = header.select("div.profile-top div.profile-headline h1.profile-title")[ + 0 + ].text + + location = header.select("span.member-location") + if location != []: + location = location[0].text + else: + location = 0 + + signup = header.select("span.member-signup-date") + if signup != []: + signup = signup[0].text + else: + signup = 0 + + instructables = header.select("span.ible-count") + if instructables != []: + instructables = instructables[0].text + else: + instructables = 0 + + views = header.select("span.total-views") + if views != []: + views = views[0].text + else: + views = 0 + + comments = header.select("span.total-comments") + if comments != []: + comments = comments[0].text + else: + comments = 0 + + followers = header.select("span.follower-count") + if followers != []: + followers = followers[0].text + else: + followers = 0 + + bio = header.select("span.member-bio") + if bio != []: + bio = bio[0].text + else: + bio = "" + + return { + "avatar": avatar, + "title": title, + "location": location, + "signup": signup, + "instructables": instructables, + "views": views, + "comments": comments, + "followers": followers, + "bio": bio, + } + + +def explore_lists(soup): + list_ = [] + for ible in soup.select(".home-content-explore-ible"): + link = ible.a["href"] + img = proxy(ible.select("a img")[0].get("data-src")) + alt = ible.select("a img")[0].get("alt") + title = ible.select("div strong a")[0].text + author = ible.select("div span.ible-author a")[0].text + author_link = ible.select("div span.ible-author a")[0].get("href") + channel = ible.select("div span.ible-channel a")[0].text + channel_link = ible.select("div span.ible-channel a")[0].get("href") + views = 0 + if ible.select("span.ible-views") != []: + views = ible.select("span.ible-views")[0].text + favorites = 0 + if ible.select("span.ible-favorites") != []: + favorites = ible.select("span.ible-favorites")[0].text + list_.append( + { + "link": link, + "img": img, + "alt": alt, + "title": title, + "author": author, + "author_link": author_link, + "channel": channel, + "channel_link": channel_link, + "favorites": favorites, + "views": views, + } + ) + return list_ + + +def project_list(app, head, sort="", per_page=20): + head = f"{head + ' ' if head != '' else ''}Projects" + sort + path = urlparse(request.path).path + + page = request.args.get("page", 1, type=int) + + if path in ("/projects/", "/projects"): + ibles = app.global_ibles["/projects"] + total = len(ibles) + else: + if "projects" in path.split("/"): + ibles = [] + + parts = path.split("/") + category = parts[1] + channel = "" if parts[2] == "projects" else parts[2] + + channel_names = unslugify(channel) + + for channel_name in channel_names: + project_ibles, total = projects_search( + app, + category=category, + channel=channel_name, + per_page=per_page, + page=page, + ) + + if project_ibles: + break + + elif "search" in path.split("/"): + ibles = [] + query = ( + request.args.get("q") if request.method == "GET" else request.form["q"] + ) + + project_ibles, total = projects_search( + app, + query=query, + filter_by="", + per_page=per_page, + page=page, + query_by="title,screenName", + ) + + else: + abort(404) + + for ible in project_ibles: + link = f"/{ible['document']['urlString']}" + img = proxy(ible["document"]["coverImageUrl"]) + + title = ible["document"]["title"] + author = ible["document"]["screenName"] + author_link = f"/member/{author}" + + channel = ible["document"]["primaryClassification"] + channel_link = f"/channel/{channel}" + + views = ible["document"]["views"] + favorites = ible["document"]["favorites"] + + ibles.append( + { + "link": link, + "img": img, + "title": title, + "author": author, + "author_link": author_link, + "channel": channel, + "channel_link": channel_link, + "views": views, + "favorites": favorites, + } + ) + + return render_template( + "projects.html", + title=unslugify(head)[0], + ibles=ibles, + path=path, + pagination=get_pagination(request, total, per_page), + ) + + +def category_page(app, name, teachers=False): + path = urlparse(request.path).path + page = request.args.get("page", 1, type=int) + + ibles = [] + + channels = [] + contests = [] + + for channel in app.global_ibles["/projects"]: + if ( + channel["channel"].startswith(name.lower()) + and channel["channel"] not in channels + ): + channels.append(channel["channel"]) + + category_ibles, total = projects_search( + app, category=name, page=page, filter_by="featureFlag:=true" + ) + + for ible in category_ibles: + link = f"/{ible['document']['urlString']}" + img = proxy(ible["document"]["coverImageUrl"]) + + title = ible["document"]["title"] + author = ible["document"]["screenName"] + author_link = f"/member/{author}" + + channel = ible["document"]["primaryClassification"] + channel_link = f"/channel/{channel}" + + views = ible["document"]["views"] + favorites = ible["document"]["favorites"] + + ibles.append( + { + "link": link, + "img": img, + "title": title, + "author": author, + "author_link": author_link, + "channel": channel, + "channel_link": channel_link, + "views": views, + "favorites": favorites, + } + ) + + return render_template( + "category.html", + title=name, + channels=channels, + ibles=ibles, + contests=contests, + path=path, + ) + + +def projects_search( + app, + query="*", + category="", + channel="", + filter_by="", + page=1, + per_page=50, + query_by="title,stepBody,screenName", + sort_by="publishDate:desc", + timeout=5, + typesense_api_key=None, +): + if category: + if filter_by: + filter_by += " && " + filter_by += f"category:={category}" + + if channel: + if filter_by: + filter_by += " && " + filter_by += f"channel:={channel}" + + query = quote(query) + filter_by = quote(filter_by) + + logging.debug( + f"Searching projects with query {query} and filter {filter_by}, page {page}" + ) + + projects_headers = {"x-typesense-api-key": app.typesense_api_key} + + request_args = { + "q": query, + "query_by": query_by, + "page": page, + "sort_by": sort_by, + "include_fields": "title,urlString,coverImageUrl,screenName,favorites,views,primaryClassification,featureFlag,prizeLevel,IMadeItCount", + "filter_by": filter_by, + "per_page": per_page, + } + + args_str = "&".join([f"{key}={value}" for key, value in request_args.items()]) + + projects_request = Request( + f"https://www.instructables.com/api_proxy/search/collections/projects/documents/search?{args_str}", + headers=projects_headers, + ) + + projects_data = urlopen(projects_request, timeout=timeout) + project_obj = json.loads(projects_data.read().decode()) + project_ibles = project_obj["hits"] + + logging.debug(f"Got {len(project_ibles)} projects") + + return project_ibles, project_obj["out_of"] + + +def update_data(app): + logging.debug("Updating data...") + + channels = [] + + sitemap_data = urlopen("https://www.instructables.com/sitemap/") + sitemap_soup = BeautifulSoup(sitemap_data.read().decode(), "html.parser") + main = sitemap_soup.select("div.sitemap-content")[0] + + for group in main.select("div.group-section"): + channels.append(group.select("h2 a")[0].text.lower()) + + app.global_ibles["/projects"] = [] + project_ibles, total = projects_search(app, filter_by="featureFlag:=true") + + while len(app.global_ibles["/projects"]) <= 0: + for ible in project_ibles: + link = f"/{ible['document']['urlString']}" + img = proxy(ible["document"]["coverImageUrl"]) + + title = ible["document"]["title"] + author = ible["document"]["screenName"] + author_link = f"/member/{author}" + + channel = ible["document"]["primaryClassification"] + channel_link = f"/channel/{channel}" + + views = ible["document"]["views"] + favorites = ible["document"]["favorites"] + + app.global_ibles["/projects"].append( + { + "link": link, + "img": img, + "title": title, + "author": author, + "author_link": author_link, + "channel": channel, + "channel_link": channel_link, + "views": views, + "favorites": favorites, + } + )