feat: Refactor background tasks

Author: Kumi, 2025-05-02 15:34:48 +02:00
parent 7e3e88cb74
commit 37dde40cbf
Signed by: kumi
GPG key ID: ECBCC9082395383F
4 changed files with 95 additions and 82 deletions
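
In summary, the five-minute background_update_data loop in the app entry point and the start_cache_cleanup_thread worker in the proxy module are removed; in their place, request handlers call maybe_update_data and maybe_cleanup_cache, which are cheap no-ops unless a module-level timestamp says the work is due. A minimal sketch of the before/after pattern, using illustrative names only (this is not code from the commit):

import threading
import time

# Before: a daemon thread loops forever, sleeping between runs.
def start_background_refresh(task, interval=300):
    def loop():
        while True:
            task()
            time.sleep(interval)
    threading.Thread(target=loop, daemon=True).start()

# After: request handlers call a cheap guard instead; the task runs at
# most once per interval, and there is no thread to manage or monitor.
_last_run = 0.0

def maybe_refresh(task, interval=300):
    global _last_run
    now = time.time()
    if now - _last_run >= interval:
        task()
        _last_run = now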

View file

@@ -1,14 +1,12 @@
#!/usr/bin/env python
from flask import Flask
import threading
import time
import logging
import time
from .config import Config
from .routes import init_routes
from .utils.data import update_data
from .routes.proxy import start_cache_cleanup_thread
from .utils.data import maybe_update_data
# Configure logging
logger = logging.getLogger(__name__)
@@ -19,47 +17,30 @@ app.config.from_object(Config)
logger.debug("Initializing routes")
init_routes(app)
logger.debug("Performing initial data update")
update_data(app)
maybe_update_data(app)
def background_update_data(app):
"""Runs the update_data function every 5 minutes.
This replaces the need for a cron job to update the data.
Args:
app (Flask): The Flask app instance.
"""
logger.debug("Starting background update thread")
while True:
logger.debug("Running scheduled data update")
update_data(app)
logger.debug("Data update complete, sleeping for 5 minutes")
time.sleep(300)
def main():
if app.config["DEBUG"]:
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
else:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger.debug("Starting background update thread")
threading.Thread(target=background_update_data, args=(app,), daemon=True).start()
# Start the cache cleanup thread
start_cache_cleanup_thread(app)
logger.info(f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}")
logger.info(
f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}"
)
app.run(
port=app.config["PORT"],
host=app.config["LISTEN_HOST"],
debug=app.config["DEBUG"],
)
if __name__ == "__main__":
main()
main()

View file

@@ -6,13 +6,16 @@ from urllib.parse import quote
from werkzeug.exceptions import InternalServerError
from markdown2 import Markdown
from traceback import print_exc
import pathlib
import json
import logging
import random
from ..utils.data import update_data
from ..utils.helpers import explore_lists, proxy
from .category import project_list
from ..utils.data import maybe_update_data
logger = logging.getLogger(__name__)
@@ -22,6 +25,10 @@ def init_main_routes(app):
def route_explore():
logger.debug("Rendering explore page")
# Occasionally trigger data update (every 20th request)
if random.randint(1, 20) == 1:
maybe_update_data(app)
try:
logger.debug("Fetching data from instructables.com")
data = urlopen("https://www.instructables.com/")
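
For context, the explore route now samples roughly one request in twenty, and even a sampled request only refreshes if maybe_update_data finds that at least five minutes have passed since the last update, so per-request cost stays negligible while staleness stays bounded. A small sketch of that combined gate, with a hypothetical refresh() standing in for update_data(app):

import random
import time

_last_refresh = 0.0

def maybe_refresh_on_request(sample=20, interval=300):
    """Trigger a refresh on ~1/sample of requests, at most once per interval."""
    global _last_refresh
    if random.randint(1, sample) != 1:
        return  # most requests skip even the timestamp check
    now = time.time()
    if now - _last_refresh >= interval:
        refresh()  # hypothetical stand-in for update_data(app)
        _last_refresh = now

def refresh():
    print("data refreshed")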

View file

@@ -7,14 +7,12 @@ import logging
import os
import hashlib
import time
import threading
import shutil
import random
logger = logging.getLogger(__name__)
# Cache cleanup thread reference
cache_cleanup_thread = None
# Track last cache cleanup time
last_cache_cleanup = 0
def get_cache_path(app, url):
@@ -95,24 +93,35 @@ def get_content_type(cache_path):
return "application/octet-stream"
def cache_cleanup(app):
"""Clean up the cache directory to stay within size limits.
This function removes the oldest files first until the cache size
is below the maximum size.
def maybe_cleanup_cache(app):
"""Clean up the cache directory if it's time to do so.
Args:
app: The Flask app instance.
"""
global last_cache_cleanup
# If caching is disabled, don't do anything
if not app.config["CACHE_ENABLED"]:
return
# Check if it's time to run cleanup
current_time = time.time()
cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"]
if current_time - last_cache_cleanup < cleanup_interval:
logger.debug(
f"Cache cleanup skipped. Time since last cleanup: {current_time - last_cache_cleanup:.2f} seconds"
)
return
logger.debug("Starting cache cleanup")
last_cache_cleanup = current_time
try:
cache_dir = app.config["CACHE_DIR"]
max_size = app.config["CACHE_MAX_SIZE"]
max_age = app.config["CACHE_MAX_AGE"]
# Get all cache files with their modification times
cache_files = []
@@ -121,6 +130,10 @@ def cache_cleanup(app):
for filename in os.listdir(cache_dir):
file_path = os.path.join(cache_dir, filename)
if os.path.isfile(file_path):
# Skip metadata files in the count
if file_path.endswith(".meta"):
continue
file_size = os.path.getsize(file_path)
file_time = os.path.getmtime(file_path)
total_size += file_size
@@ -128,7 +141,34 @@ def cache_cleanup(app):
logger.debug(f"Current cache size: {total_size / (1024 * 1024):.2f} MB")
# If we're over the size limit, remove oldest files first
# First, remove expired files
current_time = time.time()
expired_files = [
(path, mtime, size)
for path, mtime, size in cache_files
if current_time - mtime > max_age
]
for file_path, _, file_size in expired_files:
try:
os.remove(file_path)
# Also remove metadata file if it exists
meta_path = file_path + ".meta"
if os.path.exists(meta_path):
os.remove(meta_path)
total_size -= file_size
logger.debug(f"Removed expired cache file: {file_path}")
except OSError:
logger.warning(f"Failed to remove expired cache file: {file_path}")
# Remove files from the list that we've already deleted
cache_files = [
(path, mtime, size)
for path, mtime, size in cache_files
if (path, mtime, size) not in expired_files
]
# If we're still over the size limit, remove oldest files first
if total_size > max_size:
logger.debug("Cache size exceeds limit, cleaning up")
# Sort by modification time (oldest first)
@@ -147,7 +187,7 @@ def cache_cleanup(app):
os.remove(meta_path)
total_size -= file_size
logger.debug(f"Removed cache file: {file_path}")
logger.debug(f"Removed old cache file: {file_path}")
except OSError:
logger.warning(f"Failed to remove cache file: {file_path}")
@@ -159,40 +199,6 @@ def cache_cleanup(app):
logger.error(f"Error during cache cleanup: {str(e)}")
def start_cache_cleanup_thread(app):
"""Start a background thread to periodically clean up the cache.
Args:
app: The Flask app instance.
"""
global cache_cleanup_thread
# If thread is already running, don't start another one
if cache_cleanup_thread is not None and cache_cleanup_thread.is_alive():
return
# If caching is disabled, don't start the thread
if not app.config["CACHE_ENABLED"]:
logger.debug("Caching is disabled, not starting cache cleanup thread")
return
def cleanup_worker():
while True:
try:
with app.app_context():
cache_cleanup(app)
cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"]
time.sleep(cleanup_interval)
except Exception as e:
logger.error(f"Error in cache cleanup worker: {str(e)}")
# Sleep a bit to avoid tight loop in case of recurring errors
time.sleep(60)
cache_cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True)
cache_cleanup_thread.start()
logger.debug("Started cache cleanup background thread")
def init_proxy_routes(app):
# Create cache directory if it doesn't exist and caching is enabled
if app.config["CACHE_ENABLED"]:
@@ -216,12 +222,8 @@ def init_proxy_routes(app):
logger.debug(f"Proxy request for URL: {url}, filename: {filename}")
# Check if the cache cleanup thread is running
if cache_cleanup_thread is None:
# Use every 100th request to trigger cleanup
if random.randint(1, 100) == 1:
logger.debug("Triggering cache cleanup")
cache_cleanup(app)
# Clean up the cache if needed
maybe_cleanup_cache(app)
if url is not None:
if url.startswith("https://cdn.instructables.com/") or url.startswith(

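The rewritten cleanup makes two passes: it first deletes cache entries older than CACHE_MAX_AGE (along with their .meta sidecars), then, if the total is still above CACHE_MAX_SIZE, deletes the oldest remaining entries until it fits, with .meta files excluded from the size accounting. A side-effect-free sketch of that selection policy over (path, mtime, size) tuples, with hypothetical names (not the commit's code):

import time

def select_cache_evictions(entries, max_age, max_size, now=None):
    """Return which (path, mtime, size) entries to delete.

    Pass 1: everything older than max_age.
    Pass 2: oldest-first from the rest until the remaining size fits max_size.
    """
    now = time.time() if now is None else now
    expired = [e for e in entries if now - e[1] > max_age]
    kept = [e for e in entries if now - e[1] <= max_age]
    total = sum(size for _, _, size in kept)

    kept.sort(key=lambda e: e[1])  # oldest first
    evictions = list(expired)
    for entry in kept:
        if total <= max_size:
            break
        evictions.append(entry)
        total -= entry[2]
    return evictions
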
View file

@@ -1,10 +1,14 @@
from urllib.request import urlopen
import logging
import time
from bs4 import BeautifulSoup
from .helpers import proxy, projects_search
logger = logging.getLogger(__name__)
# Track the last data update time
last_data_update = 0
def update_data(app):
"""Update the application's cached data.
@@ -73,4 +77,23 @@ def update_data(app):
logger.debug(f"Updated global projects list with {len(app.global_ibles['/projects'])} projects")
logger.debug("Data update completed successfully")
except Exception as e:
logger.error(f"Error updating data: {str(e)}")
logger.error(f"Error updating data: {str(e)}")
def maybe_update_data(app):
"""Updates the data if it's time to do so.
This replaces the background thread with a request-triggered update.
Args:
app (Flask): The Flask app instance.
"""
global last_data_update
current_time = time.time()
# Update every 5 minutes (300 seconds)
if current_time - last_data_update >= 300:
logger.debug("Running scheduled data update")
update_data(app)
last_data_update = current_time
logger.debug("Data update complete")
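
Both maybe_update_data and maybe_cleanup_cache repeat the same guard: a module-level timestamp compared against an interval. If more request-triggered tasks accumulate, a small decorator along these lines could factor the guard out; this is only a sketch under that assumption, not part of the commit:

import time
from functools import wraps

def throttled(interval):
    """Run the wrapped function at most once per `interval` seconds, per process."""
    def decorator(func):
        last_run = 0.0

        @wraps(func)
        def wrapper(*args, **kwargs):
            nonlocal last_run
            now = time.time()
            if now - last_run < interval:
                return None  # too soon, skip silently
            last_run = now
            return func(*args, **kwargs)

        return wrapper

    return decorator

@throttled(300)
def refresh_data(app):
    # would call update_data(app) here
    ...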