feat: Caching proxied content

Some linting
Kumi 2025-04-09 13:45:42 +02:00
parent dd043bd397
commit e86c24251d
Signed by: kumi
GPG key ID: ECBCC9082395383F
8 changed files with 358 additions and 42 deletions

View file

@@ -14,20 +14,24 @@ An open source alternative front-end to Instructables. This is a fork of <a href
## Instances
<!-- START_INSTANCE_LIST type:eq=clearnet -->
| URL | Provided by | Country | Notes |
| ---------------------------------------------------------------------- | ---------------------------------------------- | ---------------- | ------------- |
| [structables.private.coffee](https://structables.private.coffee) | [Private.coffee](https://private.coffee) | Austria 🇦🇹 🇪🇺 | Main instance |
| [structables.bloat.cat](https://structables.bloat.cat) | [Bloat.cat](https://bloat.cat) | Germany 🇩🇪 🇪🇺 | |
| [structables.darkness.services](https://structables.darkness.services) | [Darkness.services](https://darkness.services) | United States 🇺🇸 | |
<!-- END_INSTANCE_LIST -->
### Tor Hidden Services
<!-- START_INSTANCE_LIST type:eq=onion -->
| URL | Provided by | Country | Notes |
| --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | ---------------- | ------------- |
| [structables.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion](http://structables.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion) | [Private.coffee](https://private.coffee) | Austria 🇦🇹 🇪🇺 | Main instance |
| [structables.darknessrdor43qkl2ngwitj72zdavfz2cead4t5ed72bybgauww5lyd.onion](http://structables.darknessrdor43qkl2ngwitj72zdavfz2cead4t5ed72bybgauww5lyd.onion) | [Darkness.services](https://darkness.services) | United States 🇺🇸 | |
<!-- END_INSTANCE_LIST -->
### Adding Your Instance
@@ -86,6 +90,11 @@ Structables supports the use of the following environment variables for configuration
- `STRUCTABLES_PRIVACY_FILE`: The path to a text file or Markdown file (with .md suffix) to use for the Privacy Policy page (if unset, try `privacy.txt` or `privacy.md` in the working directory, or fall back to a generic message)
- `STRUCTABLES_DEBUG`: If set, log additional debug information to stdout
- `STRUCTABLES_THEME`: Allows selecting a theme for the frontend. Currently, only `dark` and `light` are supported. If not set, it will be automatically detected based on the user's system settings, and a toggle will be provided in the header.
- `STRUCTABLES_CACHE_ENABLED`: Whether to enable caching of proxied content (default: true). Set to `false`, `0`, `no`, `off` or `n` to disable caching.
- `STRUCTABLES_CACHE_DIR`: The directory to use for caching proxied content (default: `structables_cache` within the temporary directory as returned by `tempfile.gettempdir()`)
- `STRUCTABLES_CACHE_MAX_AGE`: The maximum age of cached content in seconds before it's considered stale (default: 604800 seconds, or 1 week)
- `STRUCTABLES_CACHE_MAX_SIZE`: The maximum size of the cache directory in bytes (default: 1073741824 bytes, or 1GB)
- `STRUCTABLES_CACHE_CLEANUP_INTERVAL`: How often to run the cache cleanup process, in seconds (default: 3600 seconds, or 1 hour); see the example below this list.
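
For illustration, this sketch resolves the cache settings the same way the application's `Config` class (changed in this commit) does; the printed values are whatever your environment provides:

```python
# Illustrative sketch only; mirrors the parsing in the Config class below.
import os
import tempfile

enabled = os.environ.get("STRUCTABLES_CACHE_ENABLED", "true").lower() not in (
    "false", "0", "no", "off", "n"
)
cache_dir = os.environ.get("STRUCTABLES_CACHE_DIR")
if cache_dir is None:
    cache_dir = os.path.join(tempfile.gettempdir(), "structables_cache")
max_age = int(os.environ.get("STRUCTABLES_CACHE_MAX_AGE", 60 * 60 * 24 * 7))
max_size = int(os.environ.get("STRUCTABLES_CACHE_MAX_SIZE", 1024 * 1024 * 1024))

print(f"caching enabled: {enabled}")
print(f"cache dir: {cache_dir}, max age: {max_age}s, max size: {max_size} bytes")
```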
## License

View file

@@ -1,4 +1,6 @@
ruff
black
isort
mypy
types-beautifulsoup4
types-colorama

View file

@@ -1,4 +1,8 @@
import os
import tempfile
from .utils.helpers import get_typesense_api_key
class Config:
    DEBUG = os.environ.get("FLASK_DEBUG", os.environ.get("STRUCTABLES_DEBUG", False))
@@ -8,7 +12,33 @@ class Config:
    UNSAFE = os.environ.get("STRUCTABLES_UNSAFE", False)
    PRIVACY_FILE = os.environ.get("STRUCTABLES_PRIVACY_FILE")
    THEME = os.environ.get("STRUCTABLES_THEME", "auto")
    TYPESENSE_API_KEY = get_typesense_api_key()

    # Cache settings
    CACHE_ENABLED = os.environ.get("STRUCTABLES_CACHE_ENABLED", "true").lower() not in (
        "false",
        "0",
        "no",
        "off",
        "n",
    )
    CACHE_DIR = os.environ.get("STRUCTABLES_CACHE_DIR")
    if CACHE_DIR is None:
        CACHE_DIR = os.path.join(tempfile.gettempdir(), "structables_cache")

    CACHE_MAX_AGE = int(
        os.environ.get("STRUCTABLES_CACHE_MAX_AGE", 60 * 60 * 24 * 7)
    )  # 1 week default
    CACHE_MAX_SIZE = int(
        os.environ.get("STRUCTABLES_CACHE_MAX_SIZE", 1024 * 1024 * 1024)
    )  # 1GB default
    CACHE_CLEANUP_INTERVAL = int(
        os.environ.get("STRUCTABLES_CACHE_CLEANUP_INTERVAL", 60 * 60)
    )  # 1 hour default

    @staticmethod
    def init_app(app):
        pass
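
These upper-case class attributes are picked up through Flask's standard config mechanism, as in the `app.config.from_object(Config)` call later in this commit. A minimal sketch of that pattern:

```python
# Minimal sketch of how the Config class is consumed (assumes Config is
# importable from the application's config module, as in this commit).
from flask import Flask

app = Flask(__name__)
app.config.from_object(Config)  # from_object copies upper-case attributes only

if app.config["CACHE_ENABLED"]:
    print("cache dir:", app.config["CACHE_DIR"])
```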

View file

@@ -8,21 +8,19 @@ import logging
from .config import Config
from .routes import init_routes
from .utils.data import update_data
from .routes.proxy import start_cache_cleanup_thread
# Configure logging
logger = logging.getLogger(__name__)
app = Flask(__name__, template_folder="templates", static_folder="static")
app.config.from_object(Config)
logger.debug("Initializing routes")
init_routes(app)
logger.debug("Performing initial data update")
update_data(app)
def background_update_data(app):
    """Runs the update_data function every 5 minutes.
@@ -38,31 +36,30 @@ def background_update_data(app):
logger.debug("Data update complete, sleeping for 5 minutes")
time.sleep(300)
def main():
    if app.config["DEBUG"]:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
    else:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )

    logger.debug("Starting background update thread")
    threading.Thread(target=background_update_data, args=(app,), daemon=True).start()

    # Start the cache cleanup thread
    start_cache_cleanup_thread(app)

    logger.info(f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}")

    app.run(
        port=app.config["PORT"],
        host=app.config["LISTEN_HOST"],
        debug=app.config["DEBUG"],
    )


if __name__ == "__main__":
    main()

View file

@@ -74,8 +74,8 @@ def init_contest_routes(app):
        )

    def get_entries(contest):
        base_url = "https://www.instructables.com/api_proxy/search/collections/projects/documents/search"
        headers = {"x-typesense-api-key": app.config["TYPESENSE_API_KEY"]}

        page, per_page = 1, 100
        all_entries = []
@@ -177,7 +177,7 @@ def init_contest_routes(app):
"https://www.instructables.com/json-api/getCurrentContests?limit=50&offset=0"
)
data = json.loads(response.read().decode())
logger.debug(f"Received current contests data")
logger.debug("Received current contests data")
except HTTPError as e:
logger.error(f"HTTP error fetching current contests: {e.code}")
abort(e.code)

View file

@@ -134,7 +134,7 @@ def init_main_routes(app):
f"https://www.instructables.com/json-api/showInstructableModel?urlString={article}"
)
data = json.loads(data.read().decode())
logger.debug(f"Successfully fetched article data")
logger.debug("Successfully fetched article data")
except HTTPError as e:
logger.error(f"HTTP error fetching article: {e.code}")
abort(e.code)

View file

@@ -4,15 +4,215 @@ from urllib.parse import unquote
from urllib.error import HTTPError
from urllib.request import urlopen
import logging
import os
import hashlib
import time
import threading
import shutil

logger = logging.getLogger(__name__)

# Cache cleanup thread reference
cache_cleanup_thread = None
def get_cache_path(app, url):
    """Generate a cache file path for a URL.

    Args:
        app: The Flask app instance.
        url (str): The URL to cache.

    Returns:
        str: The path to the cache file.
    """
    # Create a hash of the URL to use as the filename
    url_hash = hashlib.sha256(url.encode()).hexdigest()
    cache_dir = app.config["CACHE_DIR"]
    return os.path.join(cache_dir, url_hash)
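
# Example (illustrative): for url = "https://content.instructables.com/x.jpg",
# this returns os.path.join(CACHE_DIR, "<64-hex-char sha256 digest of the URL>").
# Because the digest is pure hex, cache filenames are flat and cannot contain
# path separators, regardless of the URL contents.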
def is_cached(app, url):
    """Check if a URL is cached and not expired.

    Args:
        app: The Flask app instance.
        url (str): The URL to check.

    Returns:
        bool: True if the URL is cached and not expired, False otherwise.
    """
    # If caching is disabled, always return False
    if not app.config["CACHE_ENABLED"]:
        return False

    cache_path = get_cache_path(app, url)

    # Check if the file exists
    if not os.path.exists(cache_path):
        return False

    # Check if the cache has expired
    cache_time = os.path.getmtime(cache_path)
    max_age = app.config["CACHE_MAX_AGE"]

    if time.time() - cache_time > max_age:
        # Cache has expired, remove it
        try:
            os.remove(cache_path)
            # Also remove metadata file if it exists
            meta_path = cache_path + ".meta"
            if os.path.exists(meta_path):
                os.remove(meta_path)
            return False
        except OSError:
            logger.warning(f"Failed to remove expired cache file: {cache_path}")
            return False

    # Cache exists and is not expired
    return True
def get_content_type(cache_path):
    """Get the content type from a cache file.

    Args:
        cache_path (str): The path to the cache file.

    Returns:
        str: The content type, or 'application/octet-stream' if not found.
    """
    meta_path = cache_path + ".meta"
    if os.path.exists(meta_path):
        try:
            with open(meta_path, "r") as f:
                return f.read().strip()
        except OSError:
            logger.warning(
                f"Failed to read content type from cache metadata: {meta_path}"
            )
    return "application/octet-stream"
def cache_cleanup(app):
    """Clean up the cache directory to stay within size limits.

    This function removes the oldest files first until the cache size
    is below the maximum size.

    Args:
        app: The Flask app instance.
    """
    # If caching is disabled, don't do anything
    if not app.config["CACHE_ENABLED"]:
        return

    logger.debug("Starting cache cleanup")

    try:
        cache_dir = app.config["CACHE_DIR"]
        max_size = app.config["CACHE_MAX_SIZE"]

        # Get all cache files with their modification times
        cache_files = []
        total_size = 0

        for filename in os.listdir(cache_dir):
            file_path = os.path.join(cache_dir, filename)
            if os.path.isfile(file_path):
                file_size = os.path.getsize(file_path)
                file_time = os.path.getmtime(file_path)
                total_size += file_size
                cache_files.append((file_path, file_time, file_size))

        logger.debug(f"Current cache size: {total_size / (1024 * 1024):.2f} MB")

        # If we're over the size limit, remove oldest files first
        if total_size > max_size:
            logger.debug("Cache size exceeds limit, cleaning up")

            # Sort by modification time (oldest first)
            cache_files.sort(key=lambda x: x[1])

            # Remove files until we're under the limit
            for file_path, _, file_size in cache_files:
                if total_size <= max_size:
                    break
                try:
                    os.remove(file_path)
                    # Also remove metadata file if it exists
                    meta_path = file_path + ".meta"
                    if os.path.exists(meta_path):
                        os.remove(meta_path)
                    total_size -= file_size
                    logger.debug(f"Removed cache file: {file_path}")
                except OSError:
                    logger.warning(f"Failed to remove cache file: {file_path}")

            logger.debug(
                f"Cache cleanup complete. New size: {total_size / (1024 * 1024):.2f} MB"
            )
    except Exception as e:
        logger.error(f"Error during cache cleanup: {str(e)}")
def start_cache_cleanup_thread(app):
    """Start a background thread to periodically clean up the cache.

    Args:
        app: The Flask app instance.
    """
    global cache_cleanup_thread

    # If thread is already running, don't start another one
    if cache_cleanup_thread is not None and cache_cleanup_thread.is_alive():
        return

    # If caching is disabled, don't start the thread
    if not app.config["CACHE_ENABLED"]:
        logger.debug("Caching is disabled, not starting cache cleanup thread")
        return

    def cleanup_worker():
        while True:
            try:
                with app.app_context():
                    cache_cleanup(app)
                cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"]
                time.sleep(cleanup_interval)
            except Exception as e:
                logger.error(f"Error in cache cleanup worker: {str(e)}")
                # Sleep a bit to avoid tight loop in case of recurring errors
                time.sleep(60)

    cache_cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True)
    cache_cleanup_thread.start()
    logger.debug("Started cache cleanup background thread")
def init_proxy_routes(app):
    # Create cache directory if it doesn't exist and caching is enabled
    if app.config["CACHE_ENABLED"]:
        cache_dir = app.config["CACHE_DIR"]
        os.makedirs(cache_dir, exist_ok=True)
        logger.debug(f"Cache directory: {cache_dir}")
        logger.debug(f"Cache max age: {app.config['CACHE_MAX_AGE']} seconds")
        logger.debug(
            f"Cache max size: {app.config['CACHE_MAX_SIZE'] / (1024 * 1024):.2f} MB"
        )
        logger.debug(
            f"Cache cleanup interval: {app.config['CACHE_CLEANUP_INTERVAL']} seconds"
        )
    else:
        logger.debug("Caching is disabled")

    @app.route("/proxy/")
    def route_proxy():
        url = request.args.get("url")
        filename = request.args.get("filename")

        logger.debug(f"Proxy request for URL: {url}, filename: {filename}")

        if url is not None:
@@ -20,27 +220,102 @@ def init_proxy_routes(app):
"https://content.instructables.com/"
):
logger.debug(f"Valid proxy URL: {url}")
unquoted_url = unquote(url)
def generate():
# Subfunction to allow streaming the data instead of
# downloading all of it at once
try:
logger.debug(f"Opening connection to {url}")
with urlopen(unquote(url)) as data:
logger.debug("Connection established, streaming data")
# Check if the content is already cached
if is_cached(app, unquoted_url):
logger.debug(f"Serving cached content for: {unquoted_url}")
cache_path = get_cache_path(app, unquoted_url)
content_type = get_content_type(cache_path)
def generate_from_cache():
with open(cache_path, "rb") as f:
while True:
chunk = data.read(1024 * 1024)
chunk = f.read(1024 * 1024)
if not chunk:
break
yield chunk
logger.debug("Finished streaming data")
headers = dict()
if filename is not None:
headers["Content-Disposition"] = (
f'attachment; filename="{filename}"'
)
return Response(
generate_from_cache(),
content_type=content_type,
headers=headers,
)
# Content is not cached or caching is disabled, fetch it
def generate_and_maybe_cache():
try:
logger.debug(f"Opening connection to {unquoted_url}")
with urlopen(unquoted_url) as data:
logger.debug("Connection established, streaming data")
# If caching is enabled, cache the content
if app.config["CACHE_ENABLED"]:
cache_path = get_cache_path(app, unquoted_url)
temp_path = cache_path + ".tmp"
with open(temp_path, "wb") as f:
while True:
chunk = data.read(1024 * 1024)
if not chunk:
break
f.write(chunk)
yield chunk
# Save the content type
try:
content_type = data.headers["content-type"]
with open(cache_path + ".meta", "w") as f:
f.write(content_type)
except (KeyError, OSError):
logger.warning(
f"Failed to save content type for: {unquoted_url}"
)
# Rename the temporary file to the final cache file
try:
os.rename(temp_path, cache_path)
logger.debug(
f"Successfully cached content for: {unquoted_url}"
)
except OSError:
logger.warning(
f"Failed to rename temporary cache file: {temp_path}"
)
# Try to copy and delete instead
try:
shutil.copy2(temp_path, cache_path)
os.remove(temp_path)
logger.debug(
f"Successfully cached content using copy method: {unquoted_url}"
)
except OSError:
logger.error(
f"Failed to cache content: {unquoted_url}"
)
else:
# If caching is disabled, just stream the data
while True:
chunk = data.read(1024 * 1024)
if not chunk:
break
yield chunk
except HTTPError as e:
logger.error(f"HTTP error during streaming: {e.code}")
abort(e.code)
except Exception as e:
logger.error(f"Error fetching content: {str(e)}")
abort(500)
try:
logger.debug(f"Getting content type for {url}")
with urlopen(unquote(url)) as data:
logger.debug(f"Getting content type for {unquoted_url}")
with urlopen(unquoted_url) as data:
content_type = data.headers["content-type"]
logger.debug(f"Content type: {content_type}")
except HTTPError as e:
@@ -51,14 +326,17 @@ def init_proxy_routes(app):
                        raise InternalServerError()

                headers = dict()
                if filename is not None:
                    headers["Content-Disposition"] = (
                        f'attachment; filename="{filename}"'
                    )
                    logger.debug(f"Added Content-Disposition header for {filename}")

                return Response(
                    generate_and_maybe_cache(),
                    content_type=content_type,
                    headers=headers,
                )
            else:
                logger.warning(f"Invalid proxy URL: {url}")
                raise BadRequest()
@@ -70,11 +348,11 @@ def init_proxy_routes(app):
    def route_iframe():
        url = request.args.get("url")
        url = unquote(url)

        logger.debug(f"iframe request for URL: {url}")

        if url is not None:
            return render_template("iframe.html", url=url)
        else:
            logger.warning("No URL provided for iframe")
            raise BadRequest()
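
For reference, a sketch of how a client exercises the proxy endpoint. The base URL and the asset path are assumptions for illustration; substitute your instance's address and a real `content.instructables.com` URL:

```python
# Sketch: fetch a proxied asset twice; with caching enabled, the second
# request is served from the on-disk cache (subject to CACHE_MAX_AGE).
from urllib.parse import quote
from urllib.request import urlopen

base = "http://localhost:8002"  # assumed local Structables instance
asset = "https://content.instructables.com/FXYZ/example.jpg"  # hypothetical

proxy_url = f"{base}/proxy/?url={quote(asset, safe='')}"
for attempt in ("cold", "warm"):
    with urlopen(proxy_url) as resp:
        body = resp.read()
        print(attempt, resp.headers.get("Content-Type"), len(body), "bytes")
```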

View file

@@ -519,7 +519,7 @@ def projects_search(
logger.debug(f"Searching projects: query='{query}', filter='{filter_by}', page={page}, per_page={per_page}")
projects_headers = {"x-typesense-api-key": app.typesense_api_key}
projects_headers = {"x-typesense-api-key": app.config["TYPESENSE_API_KEY"]}
request_args = {
"q": query,