feat: Caching proxied content

Some linting
Kumi 2025-04-09 13:45:42 +02:00
parent dd043bd397
commit e86c24251d
Signed by: kumi
GPG key ID: ECBCC9082395383F
8 changed files with 358 additions and 42 deletions

View file

@@ -14,20 +14,24 @@ An open source alternative front-end to Instructables. This is a fork of <a href
 ## Instances
 
 <!-- START_INSTANCE_LIST type:eq=clearnet -->
 | URL | Provided by | Country | Notes |
 | ---------------------------------------------------------------------- | ---------------------------------------------- | ---------------- | ------------- |
 | [structables.private.coffee](https://structables.private.coffee) | [Private.coffee](https://private.coffee) | Austria 🇦🇹 🇪🇺 | Main instance |
 | [structables.bloat.cat](https://structables.bloat.cat) | [Bloat.cat](https://bloat.cat) | Germany 🇩🇪 🇪🇺 | |
 | [structables.darkness.services](https://structables.darkness.services) | [Darkness.services](https://darkness.services) | United States 🇺🇸 | |
 <!-- END_INSTANCE_LIST -->
 
 ### Tor Hidden Services
 
 <!-- START_INSTANCE_LIST type:eq=onion -->
 | URL | Provided by | Country | Notes |
 | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | ---------------- | ------------- |
 | [structables.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion](http://structables.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion) | [Private.coffee](https://private.coffee) | Austria 🇦🇹 🇪🇺 | Main instance |
 | [structables.darknessrdor43qkl2ngwitj72zdavfz2cead4t5ed72bybgauww5lyd.onion](http://structables.darknessrdor43qkl2ngwitj72zdavfz2cead4t5ed72bybgauww5lyd.onion) | [Darkness.services](https://darkness.services) | United States 🇺🇸 | |
 <!-- END_INSTANCE_LIST -->
 
 ### Adding Your Instance
@@ -86,6 +90,11 @@ Structables supports the use of the following environment variables for configuration
 - `STRUCTABLES_PRIVACY_FILE`: The path to a text file or Markdown file (with .md suffix) to use for the Privacy Policy page (if unset, try `privacy.txt` or `privacy.md` in the working directory, or fall back to a generic message)
 - `STRUCTABLES_DEBUG`: If set, log additional debug information to stdout
 - `STRUCTABLES_THEME`: Allows selecting a theme for the frontend. Currently, only `dark` and `light` are supported. If not set, it will be automatically detected based on the user's system settings, and a toggle will be provided in the header.
+- `STRUCTABLES_CACHE_ENABLED`: Whether to enable caching of proxied content (default: true). Set to "false" or "0" to disable caching.
+- `STRUCTABLES_CACHE_DIR`: The directory to use for caching proxied content (default: `structables_cache` within the temporary directory as returned by `tempfile.gettempdir()`)
+- `STRUCTABLES_CACHE_MAX_AGE`: The maximum age of cached content in seconds before it's considered stale (default: 604800 seconds, or 1 week)
+- `STRUCTABLES_CACHE_MAX_SIZE`: The maximum size of the cache directory in bytes (default: 1073741824 bytes, or 1GB)
+- `STRUCTABLES_CACHE_CLEANUP_INTERVAL`: How often to run the cache cleanup process in seconds (default: 3600 seconds, or 1 hour)
 ## License
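
For illustration, the cache settings documented above could be tuned for a deployment that wants a smaller, shorter-lived cache than the defaults. This is a sketch with made-up values, shown in Python since Structables reads them from the process environment:

```python
import os

# Hypothetical tuning, set before Structables starts; values are illustrative only.
os.environ["STRUCTABLES_CACHE_DIR"] = "/var/cache/structables"     # custom cache location
os.environ["STRUCTABLES_CACHE_MAX_AGE"] = str(60 * 60 * 24)        # 1 day instead of 1 week
os.environ["STRUCTABLES_CACHE_MAX_SIZE"] = str(256 * 1024 * 1024)  # 256 MB instead of 1 GB
os.environ["STRUCTABLES_CACHE_CLEANUP_INTERVAL"] = str(15 * 60)    # sweep every 15 minutes
```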

View file

@@ -2,3 +2,5 @@ ruff
 black
 isort
 mypy
+types-beautifulsoup4
+types-colorama

View file

@@ -1,4 +1,8 @@
 import os
+import tempfile
+
+from .utils.helpers import get_typesense_api_key
+
 
 class Config:
     DEBUG = os.environ.get("FLASK_DEBUG", os.environ.get("STRUCTABLES_DEBUG", False))
@@ -8,6 +12,32 @@ class Config:
     UNSAFE = os.environ.get("STRUCTABLES_UNSAFE", False)
     PRIVACY_FILE = os.environ.get("STRUCTABLES_PRIVACY_FILE")
     THEME = os.environ.get("STRUCTABLES_THEME", "auto")
+    TYPESENSE_API_KEY = get_typesense_api_key()
+
+    # Cache settings
+    CACHE_ENABLED = os.environ.get("STRUCTABLES_CACHE_ENABLED", "true").lower() not in (
+        "false",
+        "0",
+        "no",
+        "off",
+        "n",
+    )
+    CACHE_DIR = os.environ.get("STRUCTABLES_CACHE_DIR")
+    if CACHE_DIR is None:
+        CACHE_DIR = os.path.join(
+            tempfile.gettempdir(), "structables_cache"
+        )
+    CACHE_MAX_AGE = int(
+        os.environ.get("STRUCTABLES_CACHE_MAX_AGE", 60 * 60 * 24 * 7)
+    )  # 1 week default
+    CACHE_MAX_SIZE = int(
+        os.environ.get("STRUCTABLES_CACHE_MAX_SIZE", 1024 * 1024 * 1024)
+    )  # 1GB default
+    CACHE_CLEANUP_INTERVAL = int(
+        os.environ.get("STRUCTABLES_CACHE_CLEANUP_INTERVAL", 60 * 60)
+    )  # 1 hour default
 
     @staticmethod
     def init_app(app):
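
Note how `CACHE_ENABLED` is parsed above: any value other than the listed negative strings enables the cache. A minimal standalone sketch of the same check (the helper name is ours, for illustration only):

```python
def cache_enabled(raw: str) -> bool:
    # Mirrors Config.CACHE_ENABLED: only explicit negative strings disable caching.
    return raw.lower() not in ("false", "0", "no", "off", "n")

assert cache_enabled("true") and cache_enabled("1")
assert cache_enabled("anything-else")  # unrecognized values also enable caching
assert not cache_enabled("False") and not cache_enabled("OFF")  # case-insensitive
```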

View file

@@ -8,21 +8,19 @@ import logging
 
 from .config import Config
 from .routes import init_routes
 from .utils.data import update_data
-from .utils.helpers import get_typesense_api_key
+from .routes.proxy import start_cache_cleanup_thread
 
 # Configure logging
 logger = logging.getLogger(__name__)
 
 app = Flask(__name__, template_folder="templates", static_folder="static")
 app.config.from_object(Config)
-app.typesense_api_key = get_typesense_api_key()
 
 logger.debug("Initializing routes")
 init_routes(app)
 
 logger.debug("Performing initial data update")
 update_data(app)
 
 
 def background_update_data(app):
     """Runs the update_data function every 5 minutes.
@@ -38,31 +36,30 @@ def background_update_data(app):
         logger.debug("Data update complete, sleeping for 5 minutes")
         time.sleep(300)
 
 
 def main():
     if app.config["DEBUG"]:
         logging.basicConfig(
             level=logging.DEBUG,
-            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
         )
     else:
         logging.basicConfig(
             level=logging.INFO,
-            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
         )
 
     logger.debug("Starting background update thread")
     threading.Thread(target=background_update_data, args=(app,), daemon=True).start()
 
-    logger.info(
-        f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}"
-    )
+    # Start the cache cleanup thread
+    start_cache_cleanup_thread(app)
+
+    logger.info(f"Starting Structables on {app.config['LISTEN_HOST']}:{app.config['PORT']}")
 
     app.run(
         port=app.config["PORT"],
         host=app.config["LISTEN_HOST"],
         debug=app.config["DEBUG"],
     )
 
 
 if __name__ == "__main__":
     main()

View file

@@ -74,8 +74,8 @@ def init_contest_routes(app):
         )
 
     def get_entries(contest):
-        base_url = f"https://www.instructables.com/api_proxy/search/collections/projects/documents/search"
-        headers = {"x-typesense-api-key": app.typesense_api_key}
+        base_url = "https://www.instructables.com/api_proxy/search/collections/projects/documents/search"
+        headers = {"x-typesense-api-key": app.config["TYPESENSE_API_KEY"]}
 
         page, per_page = 1, 100
         all_entries = []
@@ -177,7 +177,7 @@ def init_contest_routes(app):
             "https://www.instructables.com/json-api/getCurrentContests?limit=50&offset=0"
         )
         data = json.loads(response.read().decode())
-        logger.debug(f"Received current contests data")
+        logger.debug("Received current contests data")
     except HTTPError as e:
         logger.error(f"HTTP error fetching current contests: {e.code}")
         abort(e.code)

View file

@@ -134,7 +134,7 @@ def init_main_routes(app):
             f"https://www.instructables.com/json-api/showInstructableModel?urlString={article}"
         )
         data = json.loads(data.read().decode())
-        logger.debug(f"Successfully fetched article data")
+        logger.debug("Successfully fetched article data")
     except HTTPError as e:
         logger.error(f"HTTP error fetching article: {e.code}")
         abort(e.code)

View file

@@ -4,10 +4,210 @@ from urllib.parse import unquote
 from urllib.error import HTTPError
 from urllib.request import urlopen
 import logging
+import os
+import hashlib
+import time
+import threading
+import shutil
 
 logger = logging.getLogger(__name__)
 
+# Cache cleanup thread reference
+cache_cleanup_thread = None
+
+
+def get_cache_path(app, url):
+    """Generate a cache file path for a URL.
+
+    Args:
+        app: The Flask app instance.
+        url (str): The URL to cache.
+
+    Returns:
+        str: The path to the cache file.
+    """
+    # Create a hash of the URL to use as the filename
+    url_hash = hashlib.sha256(url.encode()).hexdigest()
+    cache_dir = app.config["CACHE_DIR"]
+    return os.path.join(cache_dir, url_hash)
+
+
+def is_cached(app, url):
+    """Check if a URL is cached and not expired.
+
+    Args:
+        app: The Flask app instance.
+        url (str): The URL to check.
+
+    Returns:
+        bool: True if the URL is cached and not expired, False otherwise.
+    """
+    # If caching is disabled, always return False
+    if not app.config["CACHE_ENABLED"]:
+        return False
+
+    cache_path = get_cache_path(app, url)
+
+    # Check if the file exists
+    if not os.path.exists(cache_path):
+        return False
+
+    # Check if the cache has expired
+    cache_time = os.path.getmtime(cache_path)
+    max_age = app.config["CACHE_MAX_AGE"]
+
+    if time.time() - cache_time > max_age:
+        # Cache has expired, remove it
+        try:
+            os.remove(cache_path)
+            # Also remove metadata file if it exists
+            meta_path = cache_path + ".meta"
+            if os.path.exists(meta_path):
+                os.remove(meta_path)
+            return False
+        except OSError:
+            logger.warning(f"Failed to remove expired cache file: {cache_path}")
+            return False
+
+    # Cache exists and is not expired
+    return True
+
+
+def get_content_type(cache_path):
+    """Get the content type from a cache file.
+
+    Args:
+        cache_path (str): The path to the cache file.
+
+    Returns:
+        str: The content type, or 'application/octet-stream' if not found.
+    """
+    meta_path = cache_path + ".meta"
+    if os.path.exists(meta_path):
+        try:
+            with open(meta_path, "r") as f:
+                return f.read().strip()
+        except OSError:
+            logger.warning(
+                f"Failed to read content type from cache metadata: {meta_path}"
+            )
+    return "application/octet-stream"
+
+
+def cache_cleanup(app):
+    """Clean up the cache directory to stay within size limits.
+
+    This function removes the oldest files first until the cache size
+    is below the maximum size.
+
+    Args:
+        app: The Flask app instance.
+    """
+    # If caching is disabled, don't do anything
+    if not app.config["CACHE_ENABLED"]:
+        return
+
+    logger.debug("Starting cache cleanup")
+
+    try:
+        cache_dir = app.config["CACHE_DIR"]
+        max_size = app.config["CACHE_MAX_SIZE"]
+
+        # Get all cache files with their modification times
+        cache_files = []
+        total_size = 0
+
+        for filename in os.listdir(cache_dir):
+            file_path = os.path.join(cache_dir, filename)
+            if os.path.isfile(file_path):
+                file_size = os.path.getsize(file_path)
+                file_time = os.path.getmtime(file_path)
+                total_size += file_size
+                cache_files.append((file_path, file_time, file_size))
+
+        logger.debug(f"Current cache size: {total_size / (1024 * 1024):.2f} MB")
+
+        # If we're over the size limit, remove oldest files first
+        if total_size > max_size:
+            logger.debug("Cache size exceeds limit, cleaning up")
+
+            # Sort by modification time (oldest first)
+            cache_files.sort(key=lambda x: x[1])
+
+            # Remove files until we're under the limit
+            for file_path, _, file_size in cache_files:
+                if total_size <= max_size:
+                    break
+
+                try:
+                    os.remove(file_path)
+                    # Also remove metadata file if it exists
+                    meta_path = file_path + ".meta"
+                    if os.path.exists(meta_path):
+                        os.remove(meta_path)
+                    total_size -= file_size
+                    logger.debug(f"Removed cache file: {file_path}")
+                except OSError:
+                    logger.warning(f"Failed to remove cache file: {file_path}")
+
+        logger.debug(
+            f"Cache cleanup complete. New size: {total_size / (1024 * 1024):.2f} MB"
+        )
+    except Exception as e:
+        logger.error(f"Error during cache cleanup: {str(e)}")
+
+
+def start_cache_cleanup_thread(app):
+    """Start a background thread to periodically clean up the cache.
+
+    Args:
+        app: The Flask app instance.
+    """
+    global cache_cleanup_thread
+
+    # If thread is already running, don't start another one
+    if cache_cleanup_thread is not None and cache_cleanup_thread.is_alive():
+        return
+
+    # If caching is disabled, don't start the thread
+    if not app.config["CACHE_ENABLED"]:
+        logger.debug("Caching is disabled, not starting cache cleanup thread")
+        return
+
+    def cleanup_worker():
+        while True:
+            try:
+                with app.app_context():
+                    cache_cleanup(app)
+                cleanup_interval = app.config["CACHE_CLEANUP_INTERVAL"]
+                time.sleep(cleanup_interval)
+            except Exception as e:
+                logger.error(f"Error in cache cleanup worker: {str(e)}")
+                # Sleep a bit to avoid tight loop in case of recurring errors
+                time.sleep(60)
+
+    cache_cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True)
+    cache_cleanup_thread.start()
+    logger.debug("Started cache cleanup background thread")
+
+
 def init_proxy_routes(app):
+    # Create cache directory if it doesn't exist and caching is enabled
+    if app.config["CACHE_ENABLED"]:
+        cache_dir = app.config["CACHE_DIR"]
+        os.makedirs(cache_dir, exist_ok=True)
+        logger.debug(f"Cache directory: {cache_dir}")
+        logger.debug(f"Cache max age: {app.config['CACHE_MAX_AGE']} seconds")
+        logger.debug(
+            f"Cache max size: {app.config['CACHE_MAX_SIZE'] / (1024 * 1024):.2f} MB"
+        )
+        logger.debug(
+            f"Cache cleanup interval: {app.config['CACHE_CLEANUP_INTERVAL']} seconds"
+        )
+    else:
+        logger.debug("Caching is disabled")
+
     @app.route("/proxy/")
     def route_proxy():
         url = request.args.get("url")
@@ -20,27 +220,102 @@ def init_proxy_routes(app):
             "https://content.instructables.com/"
         ):
             logger.debug(f"Valid proxy URL: {url}")
+            unquoted_url = unquote(url)
 
+            # Check if the content is already cached
+            if is_cached(app, unquoted_url):
+                logger.debug(f"Serving cached content for: {unquoted_url}")
+                cache_path = get_cache_path(app, unquoted_url)
+                content_type = get_content_type(cache_path)
+
+                def generate_from_cache():
+                    with open(cache_path, "rb") as f:
+                        while True:
+                            chunk = f.read(1024 * 1024)
+                            if not chunk:
+                                break
+                            yield chunk
+
+                headers = dict()
+                if filename is not None:
+                    headers["Content-Disposition"] = (
+                        f'attachment; filename="{filename}"'
+                    )
+
+                return Response(
+                    generate_from_cache(),
+                    content_type=content_type,
+                    headers=headers,
+                )
+
-            def generate():
-                # Subfunction to allow streaming the data instead of
-                # downloading all of it at once
+            # Content is not cached or caching is disabled, fetch it
+            def generate_and_maybe_cache():
                 try:
-                    logger.debug(f"Opening connection to {url}")
-                    with urlopen(unquote(url)) as data:
+                    logger.debug(f"Opening connection to {unquoted_url}")
+                    with urlopen(unquoted_url) as data:
                         logger.debug("Connection established, streaming data")
+                        # If caching is enabled, cache the content
+                        if app.config["CACHE_ENABLED"]:
+                            cache_path = get_cache_path(app, unquoted_url)
+                            temp_path = cache_path + ".tmp"
+
+                            with open(temp_path, "wb") as f:
+                                while True:
+                                    chunk = data.read(1024 * 1024)
+                                    if not chunk:
+                                        break
+                                    f.write(chunk)
+                                    yield chunk
+
+                            # Save the content type
+                            try:
+                                content_type = data.headers["content-type"]
+                                with open(cache_path + ".meta", "w") as f:
+                                    f.write(content_type)
+                            except (KeyError, OSError):
+                                logger.warning(
+                                    f"Failed to save content type for: {unquoted_url}"
+                                )
+
+                            # Rename the temporary file to the final cache file
+                            try:
+                                os.rename(temp_path, cache_path)
+                                logger.debug(
+                                    f"Successfully cached content for: {unquoted_url}"
+                                )
+                            except OSError:
+                                logger.warning(
+                                    f"Failed to rename temporary cache file: {temp_path}"
+                                )
+                                # Try to copy and delete instead
+                                try:
+                                    shutil.copy2(temp_path, cache_path)
+                                    os.remove(temp_path)
+                                    logger.debug(
+                                        f"Successfully cached content using copy method: {unquoted_url}"
+                                    )
+                                except OSError:
+                                    logger.error(
+                                        f"Failed to cache content: {unquoted_url}"
+                                    )
+                        else:
+                            # If caching is disabled, just stream the data
                             while True:
                                 chunk = data.read(1024 * 1024)
                                 if not chunk:
                                     break
                                 yield chunk
+                    logger.debug("Finished streaming data")
                 except HTTPError as e:
                     logger.error(f"HTTP error during streaming: {e.code}")
                     abort(e.code)
+                except Exception as e:
+                    logger.error(f"Error fetching content: {str(e)}")
+                    abort(500)
 
             try:
-                logger.debug(f"Getting content type for {url}")
-                with urlopen(unquote(url)) as data:
+                logger.debug(f"Getting content type for {unquoted_url}")
+                with urlopen(unquoted_url) as data:
                     content_type = data.headers["content-type"]
                     logger.debug(f"Content type: {content_type}")
             except HTTPError as e:
@@ -51,14 +326,17 @@ def init_proxy_routes(app):
                 raise InternalServerError()
 
             headers = dict()
             if filename is not None:
                 headers["Content-Disposition"] = (
                     f'attachment; filename="{filename}"'
                 )
                 logger.debug(f"Added Content-Disposition header for {filename}")
 
-            return Response(generate(), content_type=content_type, headers=headers)
+            return Response(
+                generate_and_maybe_cache(),
+                content_type=content_type,
+                headers=headers,
+            )
         else:
             logger.warning(f"Invalid proxy URL: {url}")
             raise BadRequest()
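
To make the on-disk layout concrete: each proxied URL is stored in `CACHE_DIR` under the SHA-256 hex digest of the unquoted URL, with the upstream content type in a `.meta` sidecar file next to it. A standalone sketch of locating an entry, mirroring `get_cache_path` and `get_content_type` above (the example URL is made up, and the default cache directory is hard-coded here rather than read from the Flask config):

```python
import hashlib
import os
import tempfile

# Default cache location, matching Config.CACHE_DIR.
CACHE_DIR = os.path.join(tempfile.gettempdir(), "structables_cache")


def cache_entry(url: str) -> tuple[str, str]:
    # The cache filename is the SHA-256 hex digest of the full URL;
    # the ".meta" sidecar holds the upstream Content-Type header.
    url_hash = hashlib.sha256(url.encode()).hexdigest()
    body_path = os.path.join(CACHE_DIR, url_hash)
    return body_path, body_path + ".meta"


body, meta = cache_entry("https://content.instructables.com/example.jpg")
print(body)  # <tempdir>/structables_cache/<64-character hex digest>
```

Because the filename is a pure function of the URL, repeated requests for the same asset hit the same file, and the write-to-`.tmp`-then-rename step above keeps partially downloaded responses from being served as complete cache entries.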

View file

@@ -519,7 +519,7 @@ def projects_search(
     logger.debug(f"Searching projects: query='{query}', filter='{filter_by}', page={page}, per_page={per_page}")
 
-    projects_headers = {"x-typesense-api-key": app.typesense_api_key}
+    projects_headers = {"x-typesense-api-key": app.config["TYPESENSE_API_KEY"]}
 
     request_args = {
         "q": query,