From 37dde40cbf42fe5ac9d6110d4a6632d61d1cdd75 Mon Sep 17 00:00:00 2001
From: Kumi
Date: Fri, 2 May 2025 15:34:48 +0200
Subject: [PATCH] feat: Refactor background tasks

---
 src/structables/main.py         |  41 ++++---------
 src/structables/routes/main.py  |   7 +++
 src/structables/routes/proxy.py | 104 ++++++++++++++++----------------
 src/structables/utils/data.py   |  25 +++++++-
 4 files changed, 95 insertions(+), 82 deletions(-)

diff --git a/src/structables/main.py b/src/structables/main.py
index ea3d20f..6a639cf 100644
--- a/src/structables/main.py
+++ b/src/structables/main.py
@@ -1,14 +1,12 @@
 #!/usr/bin/env python
 
 from flask import Flask
-import threading
-import time
 import logging
+import time
 
 from .config import Config
 from .routes import init_routes
-from .utils.data import update_data
-from .routes.proxy import start_cache_cleanup_thread
+from .utils.data import maybe_update_data
 
 # Configure logging
 logger = logging.getLogger(__name__)
@@ -19,47 +17,30 @@ app.config.from_object(Config)
 logger.debug("Initializing routes")
 init_routes(app)
 logger.debug("Performing initial data update")
-update_data(app)
+maybe_update_data(app)
 
-def background_update_data(app):
-    """Runs the update_data function every 5 minutes.
-
-    This replaces the need for a cron job to update the data.
-
-    Args:
-        app (Flask): The Flask app instance.
-    """
-    logger.debug("Starting background update thread")
-    while True:
-        logger.debug("Running scheduled data update")
-        update_data(app)
-        logger.debug("Data update complete, sleeping for 5 minutes")
-        time.sleep(300)
 
 def main():
     if app.config["DEBUG"]:
         logging.basicConfig(
             level=logging.DEBUG,
-            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
         )
     else:
         logging.basicConfig(
             level=logging.INFO,
-            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
         )
-
-    logger.debug("Starting background update thread")
-    threading.Thread(target=background_update_data, args=(app,), daemon=True).start()
-
-    # Start the cache cleanup thread
-    start_cache_cleanup_thread(app)
-
-    logger.info(f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}")
+
+    logger.info(
+        f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}"
+    )
     app.run(
         port=app.config["PORT"],
         host=app.config["LISTEN_HOST"],
         debug=app.config["DEBUG"],
     )
+
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/src/structables/routes/main.py b/src/structables/routes/main.py
index ffcf2ad..822073b 100644
--- a/src/structables/routes/main.py
+++ b/src/structables/routes/main.py
@@ -6,13 +6,16 @@ from urllib.parse import quote
 from werkzeug.exceptions import InternalServerError
 from markdown2 import Markdown
 from traceback import print_exc
+
 import pathlib
 import json
 import logging
+import random
 
 from ..utils.data import update_data
 from ..utils.helpers import explore_lists, proxy
 from .category import project_list
+from ..utils.data import maybe_update_data
 
 logger = logging.getLogger(__name__)
 
@@ -22,6 +25,10 @@ def init_main_routes(app):
     def route_explore():
         logger.debug("Rendering explore page")
 
+        # Occasionally trigger data update (every 20th request)
+        if random.randint(1, 20) == 1:
+            maybe_update_data(app)
+
         try:
             logger.debug("Fetching data from instructables.com")
             data = urlopen("https://www.instructables.com/")
diff --git a/src/structables/routes/proxy.py b/src/structables/routes/proxy.py
index fe0741c..7b1df5b 100644
--- a/src/structables/routes/proxy.py
+++ b/src/structables/routes/proxy.py
@@ -7,14 +7,12 @@ import logging
 import os
 import hashlib
 import time
-import threading
 import shutil
-import random
 
 logger = logging.getLogger(__name__)
 
-# Cache cleanup thread reference
-cache_cleanup_thread = None
+# Track last cache cleanup time
+last_cache_cleanup = 0
 
 
 def get_cache_path(app, url):
@@ -95,24 +93,35 @@ def get_content_type(cache_path):
     return "application/octet-stream"
 
 
-def cache_cleanup(app):
-    """Clean up the cache directory to stay within size limits.
-
-    This function removes the oldest files first until the cache size
-    is below the maximum size.
+def maybe_cleanup_cache(app):
+    """Clean up the cache directory if it's time to do so.
 
     Args:
         app: The Flask app instance.
     """
+    global last_cache_cleanup
+
     # If caching is disabled, don't do anything
     if not app.config["CACHE_ENABLED"]:
         return
 
+    # Check if it's time to run cleanup
+    current_time = time.time()
+    cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"]
+
+    if current_time - last_cache_cleanup < cleanup_interval:
+        logger.debug(
+            f"Cache cleanup skipped. Time since last cleanup: {current_time - last_cache_cleanup:.2f} seconds"
+        )
+        return
+
     logger.debug("Starting cache cleanup")
+    last_cache_cleanup = current_time
 
     try:
         cache_dir = app.config["CACHE_DIR"]
         max_size = app.config["CACHE_MAX_SIZE"]
+        max_age = app.config["CACHE_MAX_AGE"]
 
         # Get all cache files with their modification times
         cache_files = []
@@ -121,6 +130,10 @@
         for filename in os.listdir(cache_dir):
             file_path = os.path.join(cache_dir, filename)
             if os.path.isfile(file_path):
+                # Skip metadata files in the count
+                if file_path.endswith(".meta"):
+                    continue
+
                 file_size = os.path.getsize(file_path)
                 file_time = os.path.getmtime(file_path)
                 total_size += file_size
@@ -128,7 +141,34 @@
         logger.debug(f"Current cache size: {total_size / (1024 * 1024):.2f} MB")
 
-        # If we're over the size limit, remove oldest files first
+        # First, remove expired files
+        current_time = time.time()
+        expired_files = [
+            (path, mtime, size)
+            for path, mtime, size in cache_files
+            if current_time - mtime > max_age
+        ]
+
+        for file_path, _, file_size in expired_files:
+            try:
+                os.remove(file_path)
+                # Also remove metadata file if it exists
+                meta_path = file_path + ".meta"
+                if os.path.exists(meta_path):
+                    os.remove(meta_path)
+                total_size -= file_size
+                logger.debug(f"Removed expired cache file: {file_path}")
+            except OSError:
+                logger.warning(f"Failed to remove expired cache file: {file_path}")
+
+        # Remove files from the list that we've already deleted
+        cache_files = [
+            (path, mtime, size)
+            for path, mtime, size in cache_files
+            if (path, mtime, size) not in expired_files
+        ]
+
+        # If we're still over the size limit, remove oldest files first
         if total_size > max_size:
             logger.debug("Cache size exceeds limit, cleaning up")
             # Sort by modification time (oldest first)
@@ -147,7 +187,7 @@
                         os.remove(meta_path)
 
                     total_size -= file_size
-                    logger.debug(f"Removed cache file: {file_path}")
+                    logger.debug(f"Removed old cache file: {file_path}")
                 except OSError:
                     logger.warning(f"Failed to remove cache file: {file_path}")
 
@@ -159,40 +199,6 @@
         logger.error(f"Error during cache cleanup: {str(e)}")
 
 
-def start_cache_cleanup_thread(app):
-    """Start a background thread to periodically clean up the cache.
-
-    Args:
-        app: The Flask app instance.
-    """
-    global cache_cleanup_thread
-
-    # If thread is already running, don't start another one
-    if cache_cleanup_thread is not None and cache_cleanup_thread.is_alive():
-        return
-
-    # If caching is disabled, don't start the thread
-    if not app.config["CACHE_ENABLED"]:
-        logger.debug("Caching is disabled, not starting cache cleanup thread")
-        return
-
-    def cleanup_worker():
-        while True:
-            try:
-                with app.app_context():
-                    cache_cleanup(app)
-                cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"]
-                time.sleep(cleanup_interval)
-            except Exception as e:
-                logger.error(f"Error in cache cleanup worker: {str(e)}")
-                # Sleep a bit to avoid tight loop in case of recurring errors
-                time.sleep(60)
-
-    cache_cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True)
-    cache_cleanup_thread.start()
-    logger.debug("Started cache cleanup background thread")
-
-
 def init_proxy_routes(app):
     # Create cache directory if it doesn't exist and caching is enabled
     if app.config["CACHE_ENABLED"]:
@@ -216,12 +222,8 @@
 
         logger.debug(f"Proxy request for URL: {url}, filename: {filename}")
 
-        # Check if the cache cleanup thread is running
-        if cache_cleanup_thread is None:
-            # Use every 100th request to trigger cleanup
-            if random.randint(1, 100) == 1:
-                logger.debug("Triggering cache cleanup")
-                cache_cleanup(app)
+        # Clean up the cache if needed
+        maybe_cleanup_cache(app)
 
         if url is not None:
             if url.startswith("https://cdn.instructables.com/") or url.startswith(
diff --git a/src/structables/utils/data.py b/src/structables/utils/data.py
index 6391e76..97df0dd 100644
--- a/src/structables/utils/data.py
+++ b/src/structables/utils/data.py
@@ -1,10 +1,14 @@
 from urllib.request import urlopen
 import logging
+import time
 
 from bs4 import BeautifulSoup
 from .helpers import proxy, projects_search
 
 logger = logging.getLogger(__name__)
 
+# Track the last data update time
+last_data_update = 0
+
 def update_data(app):
     """Update the application's cached data.
@@ -73,4 +77,23 @@
         logger.debug(f"Updated global projects list with {len(app.global_ibles['/projects'])} projects")
         logger.debug("Data update completed successfully")
     except Exception as e:
-        logger.error(f"Error updating data: {str(e)}")
\ No newline at end of file
+        logger.error(f"Error updating data: {str(e)}")
+
+
+def maybe_update_data(app):
+    """Updates the data if it's time to do so.
+
+    This replaces the background thread with a request-triggered update.
+
+    Args:
+        app (Flask): The Flask app instance.
+    """
+    global last_data_update
+    current_time = time.time()
+
+    # Update every 5 minutes (300 seconds)
+    if current_time - last_data_update >= 300:
+        logger.debug("Running scheduled data update")
+        update_data(app)
+        last_data_update = current_time
+        logger.debug("Data update complete")