feat: Refactor background tasks

Kumi 2025-05-02 15:34:48 +02:00
parent 7e3e88cb74
commit 37dde40cbf
Signed by: kumi
GPG key ID: ECBCC9082395383F
4 changed files with 95 additions and 82 deletions
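
This commit replaces two daemon threads — a 5-minute data updater started in the app entry point and a cache cleanup loop in the proxy module — with request-triggered, time-gated functions: maybe_update_data() and maybe_cleanup_cache() now run inline during request handling and return immediately unless their interval has elapsed. As a rough illustration of the pattern (a minimal sketch with hypothetical names, not code from this commit):

import time

_last_run = 0  # module-level timestamp of the last run, mirroring last_data_update below

def maybe_run(task, interval=300):
    """Run task() at most once per interval; callers invoke this on every request."""
    global _last_run
    now = time.time()
    if now - _last_run >= interval:
        _last_run = now
        task()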

File 1 of 4 (application entry point):

@@ -1,14 +1,12 @@
 #!/usr/bin/env python
 from flask import Flask

-import threading
-import time
 import logging
+import time

 from .config import Config
 from .routes import init_routes
-from .utils.data import update_data
-from .routes.proxy import start_cache_cleanup_thread
+from .utils.data import maybe_update_data

 # Configure logging
 logger = logging.getLogger(__name__)
@@ -19,47 +17,30 @@ app.config.from_object(Config)
 logger.debug("Initializing routes")
 init_routes(app)

 logger.debug("Performing initial data update")
-update_data(app)
+maybe_update_data(app)

-def background_update_data(app):
-    """Runs the update_data function every 5 minutes.
-
-    This replaces the need for a cron job to update the data.
-
-    Args:
-        app (Flask): The Flask app instance.
-    """
-    logger.debug("Starting background update thread")
-    while True:
-        logger.debug("Running scheduled data update")
-        update_data(app)
-        logger.debug("Data update complete, sleeping for 5 minutes")
-        time.sleep(300)
-
 def main():
     if app.config["DEBUG"]:
         logging.basicConfig(
             level=logging.DEBUG,
-            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
         )
     else:
         logging.basicConfig(
             level=logging.INFO,
-            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
         )

-    logger.debug("Starting background update thread")
-    threading.Thread(target=background_update_data, args=(app,), daemon=True).start()
-
-    # Start the cache cleanup thread
-    start_cache_cleanup_thread(app)
-
-    logger.info(f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}")
+    logger.info(
+        f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}"
+    )

     app.run(
         port=app.config["PORT"],
         host=app.config["LISTEN_HOST"],
         debug=app.config["DEBUG"],
     )

 if __name__ == "__main__":
     main()

File 2 of 4 (main routes module):

@@ -6,13 +6,16 @@ from urllib.parse import quote
 from werkzeug.exceptions import InternalServerError
 from markdown2 import Markdown
 from traceback import print_exc
 import pathlib
 import json
 import logging
+import random

 from ..utils.data import update_data
 from ..utils.helpers import explore_lists, proxy
 from .category import project_list
+from ..utils.data import maybe_update_data

 logger = logging.getLogger(__name__)
@@ -22,6 +25,10 @@ def init_main_routes(app):
     def route_explore():
         logger.debug("Rendering explore page")

+        # Occasionally trigger data update (every 20th request)
+        if random.randint(1, 20) == 1:
+            maybe_update_data(app)
+
         try:
             logger.debug("Fetching data from instructables.com")
             data = urlopen("https://www.instructables.com/")

File 3 of 4 (proxy routes module):

@@ -7,14 +7,12 @@ import logging
 import os
 import hashlib
 import time
-import threading
 import shutil
-import random

 logger = logging.getLogger(__name__)

-# Cache cleanup thread reference
-cache_cleanup_thread = None
+# Track last cache cleanup time
+last_cache_cleanup = 0

 def get_cache_path(app, url):
@@ -95,24 +93,35 @@ def get_content_type(cache_path):
     return "application/octet-stream"

-def cache_cleanup(app):
-    """Clean up the cache directory to stay within size limits.
-
-    This function removes the oldest files first until the cache size
-    is below the maximum size.
+def maybe_cleanup_cache(app):
+    """Clean up the cache directory if it's time to do so.

     Args:
         app: The Flask app instance.
     """
+    global last_cache_cleanup
+
     # If caching is disabled, don't do anything
     if not app.config["CACHE_ENABLED"]:
         return

+    # Check if it's time to run cleanup
+    current_time = time.time()
+    cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"]
+    if current_time - last_cache_cleanup < cleanup_interval:
+        logger.debug(
+            f"Cache cleanup skipped. Time since last cleanup: {current_time - last_cache_cleanup:.2f} seconds"
+        )
+        return
+
     logger.debug("Starting cache cleanup")
+    last_cache_cleanup = current_time

     try:
         cache_dir = app.config["CACHE_DIR"]
         max_size = app.config["CACHE_MAX_SIZE"]
+        max_age = app.config["CACHE_MAX_AGE"]

         # Get all cache files with their modification times
         cache_files = []
@@ -121,6 +130,10 @@ def cache_cleanup(app):
         for filename in os.listdir(cache_dir):
             file_path = os.path.join(cache_dir, filename)
             if os.path.isfile(file_path):
+                # Skip metadata files in the count
+                if file_path.endswith(".meta"):
+                    continue
                 file_size = os.path.getsize(file_path)
                 file_time = os.path.getmtime(file_path)
                 total_size += file_size
@@ -128,7 +141,34 @@ def cache_cleanup(app):
         logger.debug(f"Current cache size: {total_size / (1024 * 1024):.2f} MB")

-        # If we're over the size limit, remove oldest files first
+        # First, remove expired files
+        current_time = time.time()
+        expired_files = [
+            (path, mtime, size)
+            for path, mtime, size in cache_files
+            if current_time - mtime > max_age
+        ]
+        for file_path, _, file_size in expired_files:
+            try:
+                os.remove(file_path)
+                # Also remove metadata file if it exists
+                meta_path = file_path + ".meta"
+                if os.path.exists(meta_path):
+                    os.remove(meta_path)
+                total_size -= file_size
+                logger.debug(f"Removed expired cache file: {file_path}")
+            except OSError:
+                logger.warning(f"Failed to remove expired cache file: {file_path}")
+
+        # Remove files from the list that we've already deleted
+        cache_files = [
+            (path, mtime, size)
+            for path, mtime, size in cache_files
+            if (path, mtime, size) not in expired_files
+        ]
+
+        # If we're still over the size limit, remove oldest files first
         if total_size > max_size:
             logger.debug("Cache size exceeds limit, cleaning up")
             # Sort by modification time (oldest first)
@@ -147,7 +187,7 @@ def cache_cleanup(app):
                     os.remove(meta_path)
                 total_size -= file_size
-                logger.debug(f"Removed cache file: {file_path}")
+                logger.debug(f"Removed old cache file: {file_path}")
             except OSError:
                 logger.warning(f"Failed to remove cache file: {file_path}")
@@ -159,40 +199,6 @@ def cache_cleanup(app):
         logger.error(f"Error during cache cleanup: {str(e)}")

-def start_cache_cleanup_thread(app):
-    """Start a background thread to periodically clean up the cache.
-
-    Args:
-        app: The Flask app instance.
-    """
-    global cache_cleanup_thread
-
-    # If thread is already running, don't start another one
-    if cache_cleanup_thread is not None and cache_cleanup_thread.is_alive():
-        return
-
-    # If caching is disabled, don't start the thread
-    if not app.config["CACHE_ENABLED"]:
-        logger.debug("Caching is disabled, not starting cache cleanup thread")
-        return
-
-    def cleanup_worker():
-        while True:
-            try:
-                with app.app_context():
-                    cache_cleanup(app)
-                cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"]
-                time.sleep(cleanup_interval)
-            except Exception as e:
-                logger.error(f"Error in cache cleanup worker: {str(e)}")
-                # Sleep a bit to avoid tight loop in case of recurring errors
-                time.sleep(60)
-
-    cache_cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True)
-    cache_cleanup_thread.start()
-    logger.debug("Started cache cleanup background thread")

 def init_proxy_routes(app):
     # Create cache directory if it doesn't exist and caching is enabled
     if app.config["CACHE_ENABLED"]:
@@ -216,12 +222,8 @@ def init_proxy_routes(app):
         logger.debug(f"Proxy request for URL: {url}, filename: {filename}")

-        # Check if the cache cleanup thread is running
-        if cache_cleanup_thread is None:
-            # Use every 100th request to trigger cleanup
-            if random.randint(1, 100) == 1:
-                logger.debug("Triggering cache cleanup")
-                cache_cleanup(app)
+        # Clean up the cache if needed
+        maybe_cleanup_cache(app)

         if url is not None:
             if url.startswith("https://cdn.instructables.com/") or url.startswith(

File 4 of 4 (data utilities module):

@@ -1,10 +1,14 @@
 from urllib.request import urlopen
 import logging
+import time

 from bs4 import BeautifulSoup

 from .helpers import proxy, projects_search

 logger = logging.getLogger(__name__)

+# Track the last data update time
+last_data_update = 0

 def update_data(app):
     """Update the application's cached data.
@@ -73,4 +77,23 @@ def update_data(app):
         logger.debug(f"Updated global projects list with {len(app.global_ibles['/projects'])} projects")

         logger.debug("Data update completed successfully")
     except Exception as e:
         logger.error(f"Error updating data: {str(e)}")
+
+def maybe_update_data(app):
+    """Updates the data if it's time to do so.
+
+    This replaces the background thread with a request-triggered update.
+
+    Args:
+        app (Flask): The Flask app instance.
+    """
+    global last_data_update
+    current_time = time.time()
+
+    # Update every 5 minutes (300 seconds)
+    if current_time - last_data_update >= 300:
+        logger.debug("Running scheduled data update")
+        update_data(app)
+        last_data_update = current_time
+        logger.debug("Data update complete")