diff --git a/README.md b/README.md index 78c35e0..e32dd3b 100644 --- a/README.md +++ b/README.md @@ -14,20 +14,24 @@ An open source alternative front-end to Instructables. This is a fork of + | URL | Provided by | Country | Notes | | ---------------------------------------------------------------------- | ---------------------------------------------- | ---------------- | ------------- | -| [structables.private.coffee](https://structables.private.coffee) | [Private.coffee](https://private.coffee) | Austria 🇦🇹 🇪🇺 | Main instance | -| [structables.bloat.cat](https://structables.bloat.cat) | [Bloat.cat](https://bloat.cat) | Germany 🇩🇪 🇪🇺 | | -| [structables.darkness.services](https://structables.darkness.services) | [Darkness.services](https://darkness.services) | United States 🇺🇸 | | +| [structables.private.coffee](https://structables.private.coffee) | [Private.coffee](https://private.coffee) | Austria 🇦🇹 🇪🇺 | Main instance | +| [structables.bloat.cat](https://structables.bloat.cat) | [Bloat.cat](https://bloat.cat) | Germany 🇩🇪 🇪🇺 | | +| [structables.darkness.services](https://structables.darkness.services) | [Darkness.services](https://darkness.services) | United States 🇺🇸 | | + ### Tor Hidden Services + | URL | Provided by | Country | Notes | | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | ---------------- | ------------- | -| [structables.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion](http://structables.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion) | [Private.coffee](https://private.coffee) | Austria 🇦🇹 🇪🇺 | Main instance | -| [structables.darknessrdor43qkl2ngwitj72zdavfz2cead4t5ed72bybgauww5lyd.onion](http://structables.darknessrdor43qkl2ngwitj72zdavfz2cead4t5ed72bybgauww5lyd.onion) | 
[Darkness.services](https://darkness.services) | United States 🇺🇸 | | +| [structables.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion](http://structables.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion) | [Private.coffee](https://private.coffee) | Austria 🇦🇹 🇪🇺 | Main instance | +| [structables.darknessrdor43qkl2ngwitj72zdavfz2cead4t5ed72bybgauww5lyd.onion](http://structables.darknessrdor43qkl2ngwitj72zdavfz2cead4t5ed72bybgauww5lyd.onion) | [Darkness.services](https://darkness.services) | United States 🇺🇸 | | + ### Adding Your Instance @@ -86,6 +90,11 @@ Structables supports the use of the following environment variables for configur - `STRUCTABLES_PRIVACY_FILE`: The path to a text file or Markdown file (with .md suffix) to use for the Privacy Policy page (if unset, try `privacy.txt` or `privacy.md` in the working directory, or fall back to a generic message) - `STRUCTABLES_DEBUG`: If set, log additional debug information to stdout - `STRUCTABLES_THEME`: Allows selecting a theme for the frontend. Currently, only `dark` and `light` are supported. If not set, it will be automatically detected based on the user's system settings, and a toggle will be provided in the header. +- `STRUCTABLES_CACHE_ENABLED`: Whether to enable caching of proxied content (default: true). Set to "false", "0", "no", "off", or "n" (case-insensitive) to disable caching. 
+- `STRUCTABLES_CACHE_DIR`: The directory to use for caching proxied content (default: `structables_cache` within the temporary directory as returned by `tempfile.gettempdir()`) +- `STRUCTABLES_CACHE_MAX_AGE`: The maximum age of cached content in seconds before it's considered stale (default: 604800 seconds, or 1 week) +- `STRUCTABLES_CACHE_MAX_SIZE`: The maximum size of the cache directory in bytes (default: 1073741824 bytes, or 1GB) +- `STRUCTABLES_CACHE_CLEANUP_INTERVAL`: How often to run the cache cleanup process in seconds (default: 3600 seconds, or 1 hour) ## License diff --git a/requirements-dev.txt b/requirements-dev.txt index 555ca07..9d0a31a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,6 @@ ruff black isort -mypy \ No newline at end of file +mypy +types-beautifulsoup4 +types-colorama \ No newline at end of file diff --git a/src/structables/config.py b/src/structables/config.py index b771d8a..a76ea98 100644 --- a/src/structables/config.py +++ b/src/structables/config.py @@ -1,4 +1,8 @@ import os +import tempfile + +from .utils.helpers import get_typesense_api_key + class Config: DEBUG = os.environ.get("FLASK_DEBUG", os.environ.get("STRUCTABLES_DEBUG", False)) @@ -8,7 +12,33 @@ class Config: UNSAFE = os.environ.get("STRUCTABLES_UNSAFE", False) PRIVACY_FILE = os.environ.get("STRUCTABLES_PRIVACY_FILE") THEME = os.environ.get("STRUCTABLES_THEME", "auto") + TYPESENSE_API_KEY = get_typesense_api_key() + + # Cache settings + CACHE_ENABLED = os.environ.get("STRUCTABLES_CACHE_ENABLED", "true").lower() not in ( + "false", + "0", + "no", + "off", + "n", + ) + CACHE_DIR = os.environ.get("STRUCTABLES_CACHE_DIR") + + if CACHE_DIR is None: + CACHE_DIR = os.path.join( + tempfile.gettempdir(), "structables_cache" + ) + + CACHE_MAX_AGE = int( + os.environ.get("STRUCTABLES_CACHE_MAX_AGE", 60 * 60 * 24 * 7) + ) # 1 week default + CACHE_MAX_SIZE = int( + os.environ.get("STRUCTABLES_CACHE_MAX_SIZE", 1024 * 1024 * 1024) + ) # 1GB default + 
CACHE_CLEANUP_INTERVAL = int( + os.environ.get("STRUCTABLES_CACHE_CLEANUP_INTERVAL", 60 * 60) + ) # 1 hour default @staticmethod def init_app(app): - pass \ No newline at end of file + pass diff --git a/src/structables/main.py b/src/structables/main.py index 32f0925..ea3d20f 100644 --- a/src/structables/main.py +++ b/src/structables/main.py @@ -8,21 +8,19 @@ import logging from .config import Config from .routes import init_routes from .utils.data import update_data -from .utils.helpers import get_typesense_api_key +from .routes.proxy import start_cache_cleanup_thread # Configure logging logger = logging.getLogger(__name__) app = Flask(__name__, template_folder="templates", static_folder="static") app.config.from_object(Config) -app.typesense_api_key = get_typesense_api_key() logger.debug("Initializing routes") init_routes(app) logger.debug("Performing initial data update") update_data(app) - def background_update_data(app): """Runs the update_data function every 5 minutes. @@ -38,31 +36,30 @@ def background_update_data(app): logger.debug("Data update complete, sleeping for 5 minutes") time.sleep(300) - def main(): if app.config["DEBUG"]: logging.basicConfig( level=logging.DEBUG, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) else: logging.basicConfig( level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) - + logger.debug("Starting background update thread") threading.Thread(target=background_update_data, args=(app,), daemon=True).start() - - logger.info( - f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}" - ) + + # Start the cache cleanup thread + start_cache_cleanup_thread(app) + + logger.info(f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}") app.run( port=app.config["PORT"], host=app.config["LISTEN_HOST"], debug=app.config["DEBUG"], 
) - if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/src/structables/routes/contest.py b/src/structables/routes/contest.py index f0f9b4d..b9b99da 100644 --- a/src/structables/routes/contest.py +++ b/src/structables/routes/contest.py @@ -74,8 +74,8 @@ def init_contest_routes(app): ) def get_entries(contest): - base_url = f"https://www.instructables.com/api_proxy/search/collections/projects/documents/search" - headers = {"x-typesense-api-key": app.typesense_api_key} + base_url = "https://www.instructables.com/api_proxy/search/collections/projects/documents/search" + headers = {"x-typesense-api-key": app.config["TYPESENSE_API_KEY"]} page, per_page = 1, 100 all_entries = [] @@ -177,7 +177,7 @@ def init_contest_routes(app): "https://www.instructables.com/json-api/getCurrentContests?limit=50&offset=0" ) data = json.loads(response.read().decode()) - logger.debug(f"Received current contests data") + logger.debug("Received current contests data") except HTTPError as e: logger.error(f"HTTP error fetching current contests: {e.code}") abort(e.code) diff --git a/src/structables/routes/main.py b/src/structables/routes/main.py index da1178c..ffcf2ad 100644 --- a/src/structables/routes/main.py +++ b/src/structables/routes/main.py @@ -134,7 +134,7 @@ def init_main_routes(app): f"https://www.instructables.com/json-api/showInstructableModel?urlString={article}" ) data = json.loads(data.read().decode()) - logger.debug(f"Successfully fetched article data") + logger.debug("Successfully fetched article data") except HTTPError as e: logger.error(f"HTTP error fetching article: {e.code}") abort(e.code) diff --git a/src/structables/routes/proxy.py b/src/structables/routes/proxy.py index bf93073..004e21d 100644 --- a/src/structables/routes/proxy.py +++ b/src/structables/routes/proxy.py @@ -4,15 +4,215 @@ from urllib.parse import unquote from urllib.error import HTTPError from urllib.request import urlopen import logging +import os +import hashlib +import 
time +import threading +import shutil logger = logging.getLogger(__name__) +# Cache cleanup thread reference +cache_cleanup_thread = None + + +def get_cache_path(app, url): + """Generate a cache file path for a URL. + + Args: + app: The Flask app instance. + url (str): The URL to cache. + + Returns: + str: The path to the cache file. + """ + # Create a hash of the URL to use as the filename + url_hash = hashlib.sha256(url.encode()).hexdigest() + cache_dir = app.config["CACHE_DIR"] + return os.path.join(cache_dir, url_hash) + + +def is_cached(app, url): + """Check if a URL is cached and not expired. + + Args: + app: The Flask app instance. + url (str): The URL to check. + + Returns: + bool: True if the URL is cached and not expired, False otherwise. + """ + # If caching is disabled, always return False + if not app.config["CACHE_ENABLED"]: + return False + + cache_path = get_cache_path(app, url) + + # Check if the file exists + if not os.path.exists(cache_path): + return False + + # Check if the cache has expired + cache_time = os.path.getmtime(cache_path) + max_age = app.config["CACHE_MAX_AGE"] + if time.time() - cache_time > max_age: + # Cache has expired, remove it + try: + os.remove(cache_path) + # Also remove metadata file if it exists + meta_path = cache_path + ".meta" + if os.path.exists(meta_path): + os.remove(meta_path) + return False + except OSError: + logger.warning(f"Failed to remove expired cache file: {cache_path}") + return False + + # Cache exists and is not expired + return True + + +def get_content_type(cache_path): + """Get the content type from a cache file. + + Args: + cache_path (str): The path to the cache file. + + Returns: + str: The content type, or 'application/octet-stream' if not found. 
+ """ + meta_path = cache_path + ".meta" + if os.path.exists(meta_path): + try: + with open(meta_path, "r") as f: + return f.read().strip() + except OSError: + logger.warning( + f"Failed to read content type from cache metadata: {meta_path}" + ) + + return "application/octet-stream" + + +def cache_cleanup(app): + """Clean up the cache directory to stay within size limits. + + This function removes the oldest files first until the cache size + is below the maximum size. + + Args: + app: The Flask app instance. + """ + # If caching is disabled, don't do anything + if not app.config["CACHE_ENABLED"]: + return + + logger.debug("Starting cache cleanup") + + try: + cache_dir = app.config["CACHE_DIR"] + max_size = app.config["CACHE_MAX_SIZE"] + + # Get all cache files with their modification times + cache_files = [] + total_size = 0 + + for filename in os.listdir(cache_dir): + file_path = os.path.join(cache_dir, filename) + if os.path.isfile(file_path): + file_size = os.path.getsize(file_path) + file_time = os.path.getmtime(file_path) + total_size += file_size + cache_files.append((file_path, file_time, file_size)) + + logger.debug(f"Current cache size: {total_size / (1024 * 1024):.2f} MB") + + # If we're over the size limit, remove oldest files first + if total_size > max_size: + logger.debug("Cache size exceeds limit, cleaning up") + # Sort by modification time (oldest first) + cache_files.sort(key=lambda x: x[1]) + + # Remove files until we're under the limit + for file_path, _, file_size in cache_files: + if total_size <= max_size: + break + + try: + os.remove(file_path) + # Also remove metadata file if it exists + meta_path = file_path + ".meta" + if os.path.exists(meta_path): + os.remove(meta_path) + + total_size -= file_size + logger.debug(f"Removed cache file: {file_path}") + except OSError: + logger.warning(f"Failed to remove cache file: {file_path}") + + logger.debug( + f"Cache cleanup complete. 
New size: {total_size / (1024 * 1024):.2f} MB" + ) + + except Exception as e: + logger.error(f"Error during cache cleanup: {str(e)}") + + +def start_cache_cleanup_thread(app): + """Start a background thread to periodically clean up the cache. + + Args: + app: The Flask app instance. + """ + global cache_cleanup_thread + + # If thread is already running, don't start another one + if cache_cleanup_thread is not None and cache_cleanup_thread.is_alive(): + return + + # If caching is disabled, don't start the thread + if not app.config["CACHE_ENABLED"]: + logger.debug("Caching is disabled, not starting cache cleanup thread") + return + + def cleanup_worker(): + while True: + try: + with app.app_context(): + cache_cleanup(app) + cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"] + time.sleep(cleanup_interval) + except Exception as e: + logger.error(f"Error in cache cleanup worker: {str(e)}") + # Sleep a bit to avoid tight loop in case of recurring errors + time.sleep(60) + + cache_cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True) + cache_cleanup_thread.start() + logger.debug("Started cache cleanup background thread") + + def init_proxy_routes(app): + # Create cache directory if it doesn't exist and caching is enabled + if app.config["CACHE_ENABLED"]: + cache_dir = app.config["CACHE_DIR"] + os.makedirs(cache_dir, exist_ok=True) + logger.debug(f"Cache directory: {cache_dir}") + logger.debug(f"Cache max age: {app.config['CACHE_MAX_AGE']} seconds") + logger.debug( + f"Cache max size: {app.config['CACHE_MAX_SIZE'] / (1024 * 1024):.2f} MB" + ) + logger.debug( + f"Cache cleanup interval: {app.config['CACHE_CLEANUP_INTERVAL']} seconds" + ) + else: + logger.debug("Caching is disabled") + @app.route("/proxy/") def route_proxy(): url = request.args.get("url") filename = request.args.get("filename") - + logger.debug(f"Proxy request for URL: {url}, filename: {filename}") if url is not None: @@ -20,27 +220,102 @@ def init_proxy_routes(app): 
"https://content.instructables.com/" ): logger.debug(f"Valid proxy URL: {url}") + unquoted_url = unquote(url) - def generate(): - # Subfunction to allow streaming the data instead of - # downloading all of it at once - try: - logger.debug(f"Opening connection to {url}") - with urlopen(unquote(url)) as data: - logger.debug("Connection established, streaming data") + # Check if the content is already cached + if is_cached(app, unquoted_url): + logger.debug(f"Serving cached content for: {unquoted_url}") + cache_path = get_cache_path(app, unquoted_url) + content_type = get_content_type(cache_path) + + def generate_from_cache(): + with open(cache_path, "rb") as f: while True: - chunk = data.read(1024 * 1024) + chunk = f.read(1024 * 1024) if not chunk: break yield chunk - logger.debug("Finished streaming data") + + headers = dict() + if filename is not None: + headers["Content-Disposition"] = ( + f'attachment; filename="{filename}"' + ) + + return Response( + generate_from_cache(), + content_type=content_type, + headers=headers, + ) + + # Content is not cached or caching is disabled, fetch it + def generate_and_maybe_cache(): + try: + logger.debug(f"Opening connection to {unquoted_url}") + with urlopen(unquoted_url) as data: + logger.debug("Connection established, streaming data") + + # If caching is enabled, cache the content + if app.config["CACHE_ENABLED"]: + cache_path = get_cache_path(app, unquoted_url) + temp_path = cache_path + ".tmp" + with open(temp_path, "wb") as f: + while True: + chunk = data.read(1024 * 1024) + if not chunk: + break + f.write(chunk) + yield chunk + + # Save the content type + try: + content_type = data.headers["content-type"] + with open(cache_path + ".meta", "w") as f: + f.write(content_type) + except (KeyError, OSError): + logger.warning( + f"Failed to save content type for: {unquoted_url}" + ) + + # Rename the temporary file to the final cache file + try: + os.rename(temp_path, cache_path) + logger.debug( + f"Successfully cached content 
for: {unquoted_url}" + ) + except OSError: + logger.warning( + f"Failed to rename temporary cache file: {temp_path}" + ) + # Try to copy and delete instead + try: + shutil.copy2(temp_path, cache_path) + os.remove(temp_path) + logger.debug( + f"Successfully cached content using copy method: {unquoted_url}" + ) + except OSError: + logger.error( + f"Failed to cache content: {unquoted_url}" + ) + else: + # If caching is disabled, just stream the data + while True: + chunk = data.read(1024 * 1024) + if not chunk: + break + yield chunk + except HTTPError as e: logger.error(f"HTTP error during streaming: {e.code}") abort(e.code) + except Exception as e: + logger.error(f"Error fetching content: {str(e)}") + abort(500) try: - logger.debug(f"Getting content type for {url}") - with urlopen(unquote(url)) as data: + logger.debug(f"Getting content type for {unquoted_url}") + with urlopen(unquoted_url) as data: content_type = data.headers["content-type"] logger.debug(f"Content type: {content_type}") except HTTPError as e: @@ -51,14 +326,17 @@ def init_proxy_routes(app): raise InternalServerError() headers = dict() - if filename is not None: headers["Content-Disposition"] = ( f'attachment; filename="{filename}"' ) logger.debug(f"Added Content-Disposition header for {filename}") - return Response(generate(), content_type=content_type, headers=headers) + return Response( + generate_and_maybe_cache(), + content_type=content_type, + headers=headers, + ) else: logger.warning(f"Invalid proxy URL: {url}") raise BadRequest() @@ -70,11 +348,11 @@ def init_proxy_routes(app): def route_iframe(): url = request.args.get("url") url = unquote(url) - + logger.debug(f"iframe request for URL: {url}") - + if url is not None: return render_template("iframe.html", url=url) else: logger.warning("No URL provided for iframe") - raise BadRequest() \ No newline at end of file + raise BadRequest() diff --git a/src/structables/utils/helpers.py b/src/structables/utils/helpers.py index be4c68c..f6cb26e 100644 
--- a/src/structables/utils/helpers.py +++ b/src/structables/utils/helpers.py @@ -519,7 +519,7 @@ def projects_search( logger.debug(f"Searching projects: query='{query}', filter='{filter_by}', page={page}, per_page={per_page}") - projects_headers = {"x-typesense-api-key": app.typesense_api_key} + projects_headers = {"x-typesense-api-key": app.config["TYPESENSE_API_KEY"]} request_args = { "q": query,