feat: Refactor background tasks
This commit is contained in:
parent
7e3e88cb74
commit
37dde40cbf
4 changed files with 95 additions and 82 deletions
|
@ -1,14 +1,12 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
from flask import Flask
|
from flask import Flask
|
||||||
import threading
|
|
||||||
import time
|
|
||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
|
|
||||||
from .config import Config
|
from .config import Config
|
||||||
from .routes import init_routes
|
from .routes import init_routes
|
||||||
from .utils.data import update_data
|
from .utils.data import maybe_update_data
|
||||||
from .routes.proxy import start_cache_cleanup_thread
|
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
@ -19,47 +17,30 @@ app.config.from_object(Config)
|
||||||
logger.debug("Initializing routes")
|
logger.debug("Initializing routes")
|
||||||
init_routes(app)
|
init_routes(app)
|
||||||
logger.debug("Performing initial data update")
|
logger.debug("Performing initial data update")
|
||||||
update_data(app)
|
maybe_update_data(app)
|
||||||
|
|
||||||
def background_update_data(app):
|
|
||||||
"""Runs the update_data function every 5 minutes.
|
|
||||||
|
|
||||||
This replaces the need for a cron job to update the data.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
app (Flask): The Flask app instance.
|
|
||||||
"""
|
|
||||||
logger.debug("Starting background update thread")
|
|
||||||
while True:
|
|
||||||
logger.debug("Running scheduled data update")
|
|
||||||
update_data(app)
|
|
||||||
logger.debug("Data update complete, sleeping for 5 minutes")
|
|
||||||
time.sleep(300)
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
if app.config["DEBUG"]:
|
if app.config["DEBUG"]:
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.DEBUG,
|
level=logging.DEBUG,
|
||||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug("Starting background update thread")
|
logger.info(
|
||||||
threading.Thread(target=background_update_data, args=(app,), daemon=True).start()
|
f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}"
|
||||||
|
)
|
||||||
# Start the cache cleanup thread
|
|
||||||
start_cache_cleanup_thread(app)
|
|
||||||
|
|
||||||
logger.info(f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}")
|
|
||||||
app.run(
|
app.run(
|
||||||
port=app.config["PORT"],
|
port=app.config["PORT"],
|
||||||
host=app.config["LISTEN_HOST"],
|
host=app.config["LISTEN_HOST"],
|
||||||
debug=app.config["DEBUG"],
|
debug=app.config["DEBUG"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
|
@ -6,13 +6,16 @@ from urllib.parse import quote
|
||||||
from werkzeug.exceptions import InternalServerError
|
from werkzeug.exceptions import InternalServerError
|
||||||
from markdown2 import Markdown
|
from markdown2 import Markdown
|
||||||
from traceback import print_exc
|
from traceback import print_exc
|
||||||
|
|
||||||
import pathlib
|
import pathlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import random
|
||||||
|
|
||||||
from ..utils.data import update_data
|
from ..utils.data import update_data
|
||||||
from ..utils.helpers import explore_lists, proxy
|
from ..utils.helpers import explore_lists, proxy
|
||||||
from .category import project_list
|
from .category import project_list
|
||||||
|
from ..utils.data import maybe_update_data
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -22,6 +25,10 @@ def init_main_routes(app):
|
||||||
def route_explore():
|
def route_explore():
|
||||||
logger.debug("Rendering explore page")
|
logger.debug("Rendering explore page")
|
||||||
|
|
||||||
|
# Occasionally trigger a data update (random 1-in-20 chance per request)
|
||||||
|
if random.randint(1, 20) == 1:
|
||||||
|
maybe_update_data(app)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logger.debug("Fetching data from instructables.com")
|
logger.debug("Fetching data from instructables.com")
|
||||||
data = urlopen("https://www.instructables.com/")
|
data = urlopen("https://www.instructables.com/")
|
||||||
|
|
|
@ -7,14 +7,12 @@ import logging
|
||||||
import os
|
import os
|
||||||
import hashlib
|
import hashlib
|
||||||
import time
|
import time
|
||||||
import threading
|
|
||||||
import shutil
|
import shutil
|
||||||
import random
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Cache cleanup thread reference
|
# Track last cache cleanup time
|
||||||
cache_cleanup_thread = None
|
last_cache_cleanup = 0
|
||||||
|
|
||||||
|
|
||||||
def get_cache_path(app, url):
|
def get_cache_path(app, url):
|
||||||
|
@ -95,24 +93,35 @@ def get_content_type(cache_path):
|
||||||
return "application/octet-stream"
|
return "application/octet-stream"
|
||||||
|
|
||||||
|
|
||||||
def cache_cleanup(app):
|
def maybe_cleanup_cache(app):
|
||||||
"""Clean up the cache directory to stay within size limits.
|
"""Clean up the cache directory if it's time to do so.
|
||||||
|
|
||||||
This function removes the oldest files first until the cache size
|
|
||||||
is below the maximum size.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
app: The Flask app instance.
|
app: The Flask app instance.
|
||||||
"""
|
"""
|
||||||
|
global last_cache_cleanup
|
||||||
|
|
||||||
# If caching is disabled, don't do anything
|
# If caching is disabled, don't do anything
|
||||||
if not app.config["CACHE_ENABLED"]:
|
if not app.config["CACHE_ENABLED"]:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Check if it's time to run cleanup
|
||||||
|
current_time = time.time()
|
||||||
|
cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"]
|
||||||
|
|
||||||
|
if current_time - last_cache_cleanup < cleanup_interval:
|
||||||
|
logger.debug(
|
||||||
|
f"Cache cleanup skipped. Time since last cleanup: {current_time - last_cache_cleanup:.2f} seconds"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
logger.debug("Starting cache cleanup")
|
logger.debug("Starting cache cleanup")
|
||||||
|
last_cache_cleanup = current_time
|
||||||
|
|
||||||
try:
|
try:
|
||||||
cache_dir = app.config["CACHE_DIR"]
|
cache_dir = app.config["CACHE_DIR"]
|
||||||
max_size = app.config["CACHE_MAX_SIZE"]
|
max_size = app.config["CACHE_MAX_SIZE"]
|
||||||
|
max_age = app.config["CACHE_MAX_AGE"]
|
||||||
|
|
||||||
# Get all cache files with their modification times
|
# Get all cache files with their modification times
|
||||||
cache_files = []
|
cache_files = []
|
||||||
|
@ -121,6 +130,10 @@ def cache_cleanup(app):
|
||||||
for filename in os.listdir(cache_dir):
|
for filename in os.listdir(cache_dir):
|
||||||
file_path = os.path.join(cache_dir, filename)
|
file_path = os.path.join(cache_dir, filename)
|
||||||
if os.path.isfile(file_path):
|
if os.path.isfile(file_path):
|
||||||
|
# Skip metadata files in the count
|
||||||
|
if file_path.endswith(".meta"):
|
||||||
|
continue
|
||||||
|
|
||||||
file_size = os.path.getsize(file_path)
|
file_size = os.path.getsize(file_path)
|
||||||
file_time = os.path.getmtime(file_path)
|
file_time = os.path.getmtime(file_path)
|
||||||
total_size += file_size
|
total_size += file_size
|
||||||
|
@ -128,7 +141,34 @@ def cache_cleanup(app):
|
||||||
|
|
||||||
logger.debug(f"Current cache size: {total_size / (1024 * 1024):.2f} MB")
|
logger.debug(f"Current cache size: {total_size / (1024 * 1024):.2f} MB")
|
||||||
|
|
||||||
# If we're over the size limit, remove oldest files first
|
# First, remove expired files
|
||||||
|
current_time = time.time()
|
||||||
|
expired_files = [
|
||||||
|
(path, mtime, size)
|
||||||
|
for path, mtime, size in cache_files
|
||||||
|
if current_time - mtime > max_age
|
||||||
|
]
|
||||||
|
|
||||||
|
for file_path, _, file_size in expired_files:
|
||||||
|
try:
|
||||||
|
os.remove(file_path)
|
||||||
|
# Also remove metadata file if it exists
|
||||||
|
meta_path = file_path + ".meta"
|
||||||
|
if os.path.exists(meta_path):
|
||||||
|
os.remove(meta_path)
|
||||||
|
total_size -= file_size
|
||||||
|
logger.debug(f"Removed expired cache file: {file_path}")
|
||||||
|
except OSError:
|
||||||
|
logger.warning(f"Failed to remove expired cache file: {file_path}")
|
||||||
|
|
||||||
|
# Remove files from the list that we've already deleted
|
||||||
|
cache_files = [
|
||||||
|
(path, mtime, size)
|
||||||
|
for path, mtime, size in cache_files
|
||||||
|
if (path, mtime, size) not in expired_files
|
||||||
|
]
|
||||||
|
|
||||||
|
# If we're still over the size limit, remove oldest files first
|
||||||
if total_size > max_size:
|
if total_size > max_size:
|
||||||
logger.debug("Cache size exceeds limit, cleaning up")
|
logger.debug("Cache size exceeds limit, cleaning up")
|
||||||
# Sort by modification time (oldest first)
|
# Sort by modification time (oldest first)
|
||||||
|
@ -147,7 +187,7 @@ def cache_cleanup(app):
|
||||||
os.remove(meta_path)
|
os.remove(meta_path)
|
||||||
|
|
||||||
total_size -= file_size
|
total_size -= file_size
|
||||||
logger.debug(f"Removed cache file: {file_path}")
|
logger.debug(f"Removed old cache file: {file_path}")
|
||||||
except OSError:
|
except OSError:
|
||||||
logger.warning(f"Failed to remove cache file: {file_path}")
|
logger.warning(f"Failed to remove cache file: {file_path}")
|
||||||
|
|
||||||
|
@ -159,40 +199,6 @@ def cache_cleanup(app):
|
||||||
logger.error(f"Error during cache cleanup: {str(e)}")
|
logger.error(f"Error during cache cleanup: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
def start_cache_cleanup_thread(app):
|
|
||||||
"""Start a background thread to periodically clean up the cache.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
app: The Flask app instance.
|
|
||||||
"""
|
|
||||||
global cache_cleanup_thread
|
|
||||||
|
|
||||||
# If thread is already running, don't start another one
|
|
||||||
if cache_cleanup_thread is not None and cache_cleanup_thread.is_alive():
|
|
||||||
return
|
|
||||||
|
|
||||||
# If caching is disabled, don't start the thread
|
|
||||||
if not app.config["CACHE_ENABLED"]:
|
|
||||||
logger.debug("Caching is disabled, not starting cache cleanup thread")
|
|
||||||
return
|
|
||||||
|
|
||||||
def cleanup_worker():
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
with app.app_context():
|
|
||||||
cache_cleanup(app)
|
|
||||||
cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"]
|
|
||||||
time.sleep(cleanup_interval)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in cache cleanup worker: {str(e)}")
|
|
||||||
# Sleep a bit to avoid tight loop in case of recurring errors
|
|
||||||
time.sleep(60)
|
|
||||||
|
|
||||||
cache_cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True)
|
|
||||||
cache_cleanup_thread.start()
|
|
||||||
logger.debug("Started cache cleanup background thread")
|
|
||||||
|
|
||||||
|
|
||||||
def init_proxy_routes(app):
|
def init_proxy_routes(app):
|
||||||
# Create cache directory if it doesn't exist and caching is enabled
|
# Create cache directory if it doesn't exist and caching is enabled
|
||||||
if app.config["CACHE_ENABLED"]:
|
if app.config["CACHE_ENABLED"]:
|
||||||
|
@ -216,12 +222,8 @@ def init_proxy_routes(app):
|
||||||
|
|
||||||
logger.debug(f"Proxy request for URL: {url}, filename: {filename}")
|
logger.debug(f"Proxy request for URL: {url}, filename: {filename}")
|
||||||
|
|
||||||
# Check if the cache cleanup thread is running
|
# Clean up the cache if needed
|
||||||
if cache_cleanup_thread is None:
|
maybe_cleanup_cache(app)
|
||||||
# Use every 100th request to trigger cleanup
|
|
||||||
if random.randint(1, 100) == 1:
|
|
||||||
logger.debug("Triggering cache cleanup")
|
|
||||||
cache_cleanup(app)
|
|
||||||
|
|
||||||
if url is not None:
|
if url is not None:
|
||||||
if url.startswith("https://cdn.instructables.com/") or url.startswith(
|
if url.startswith("https://cdn.instructables.com/") or url.startswith(
|
||||||
|
|
|
@ -1,10 +1,14 @@
|
||||||
from urllib.request import urlopen
|
from urllib.request import urlopen
|
||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from .helpers import proxy, projects_search
|
from .helpers import proxy, projects_search
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Track the last data update time
|
||||||
|
last_data_update = 0
|
||||||
|
|
||||||
def update_data(app):
|
def update_data(app):
|
||||||
"""Update the application's cached data.
|
"""Update the application's cached data.
|
||||||
|
|
||||||
|
@ -74,3 +78,22 @@ def update_data(app):
|
||||||
logger.debug("Data update completed successfully")
|
logger.debug("Data update completed successfully")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error updating data: {str(e)}")
|
logger.error(f"Error updating data: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
def maybe_update_data(app, interval=300):
    """Update the cached data if at least ``interval`` seconds have passed.

    This replaces the background thread with a request-triggered update:
    callers invoke this function opportunistically, and ``update_data`` is
    only run when the previous run was started at least ``interval``
    seconds ago.

    Args:
        app (Flask): The Flask app instance, passed through to
            ``update_data``.
        interval (float): Minimum number of seconds between two updates.
            Defaults to 300 (5 minutes).

    Raises:
        Exception: Whatever ``update_data`` raises is propagated unchanged;
            in that case the update is not counted as having run.
    """
    global last_data_update

    current_time = time.time()
    previous_update = last_data_update

    # Guard clause: nothing to do while the interval has not yet elapsed.
    if current_time - previous_update < interval:
        return

    # Record the timestamp *before* running the update. If it were written
    # only afterwards, every caller arriving while an update is still in
    # flight would also pass the check above and start a duplicate update.
    # NOTE(review): this narrows the window but is not a lock; two callers
    # can still race between the check and this assignment.
    last_data_update = current_time

    logger.debug("Running scheduled data update")
    try:
        update_data(app)
    except Exception:
        # Roll back so the next caller retries immediately, matching the
        # behaviour from when the timestamp was only set after success.
        last_data_update = previous_update
        raise
    logger.debug("Data update complete")
|
Loading…
Add table
Add a link
Reference in a new issue