commit 9e930bddbbd4d4f9115ad3d164528d797f59517e Author: Kumi Date: Wed Sep 6 12:06:20 2023 +0200 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c60b5bc --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +venv/ +*.pyc +__pycache__/ +dist/ +settings.ini \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4fbda89 --- /dev/null +++ b/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2023 Kumi Systems e.U. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5336108 --- /dev/null +++ b/README.md @@ -0,0 +1,19 @@ +# S3 Downloader + +This is a simple tool to download files from S3. It is intended to be used as a CLI tool, but can also be used as a library. 
+ +## Requirements + +* Python 3.10+ + +## Installation + +```bash +python -m venv venv +source venv/bin/activate +pip install git+https://kumig.it/kumisystems/s3downloader.git +``` + +## License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6bc0a7b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,29 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "s3downloader" +version = "0.1.0" +authors = [ + { name="Kumi Mitterer", email="s3downloader@kumi.email" }, +] +description = "Simple Python CLI tool to download files from S3" +readme = "README.md" +license = { file="LICENSE" } +requires-python = ">=3.10" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] +dependencies = [ + "boto3", +] + +[project.urls] +"Homepage" = "https://kumig.it/kumitterer/s3downloader" +"Bug Tracker" = "https://kumig.it/kumitterer/s3downloader/issues" + +[project.scripts] +s3downloader = "s3downloader.__main__:main" \ No newline at end of file diff --git a/settings.dist.ini b/settings.dist.ini new file mode 100644 index 0000000..971b442 --- /dev/null +++ b/settings.dist.ini @@ -0,0 +1,24 @@ +[S3] + +# The credentials to your S3 bucket +access_key = your-access-key +secret_key = your-secret-key + +# The name of your S3 bucket +bucket_name = your-bucket + +# The path where files are stored within that bucket +# Can be omitted if files are in the base directory +path = in/here/ + +# Where downloaded files will be stored +final_dir = /your/output/directory + +# Delete files from bucket after downloading? 
(1 = yes, 0 = no)
+delete = 1
+
+# To add further arguments for boto3.Client, list them below:
+#
+# endpoint_url = https://my.minio.instance/
+#
+# See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session.client
\ No newline at end of file
diff --git a/src/s3downloader/__init__.py b/src/s3downloader/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/s3downloader/__main__.py b/src/s3downloader/__main__.py
new file mode 100644
index 0000000..2d82260
--- /dev/null
+++ b/src/s3downloader/__main__.py
@@ -0,0 +1,51 @@
+from .classes.client import S3Client
+from .classes.config import Config
+
+import logging
+
+from argparse import ArgumentParser
+
+
+def main():
+    """Run the s3downloader command-line interface.
+
+    Parses ``--config``, ``--section`` and ``--log``, configures logging,
+    builds an S3Client from the chosen configuration section and runs one
+    ``process_files()`` pass.
+    """
+    parser = ArgumentParser()
+
+    parser.add_argument(
+        "--config",
+        type=str,
+        default="settings.ini",
+        help="Path to configuration file",
+    )
+
+    parser.add_argument(
+        "--section",
+        type=str,
+        default="S3",
+        help="Section in configuration file",
+    )
+
+    parser.add_argument(
+        "--log",
+        type=str,
+        default="INFO",
+        help="Logging level",
+    )
+
+    args = parser.parse_args()
+
+    # logging only recognises upper-case level names; normalise the value so
+    # that "--log debug" works instead of raising ValueError.
+    logging.basicConfig(level=args.log.upper())
+
+    client = S3Client.from_config(args.config, args.section)
+
+    client.process_files()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/s3downloader/classes/client.py b/src/s3downloader/classes/client.py
new file mode 100644
index 0000000..8aaad29
--- /dev/null
+++ b/src/s3downloader/classes/client.py
@@ -0,0 +1,197 @@
+import boto3
+
+import shutil
+import tempfile
+import logging
+
+from botocore.exceptions import NoCredentialsError
+from botocore.client import BaseClient
+
+from pathlib import Path
+from os import PathLike
+from typing import BinaryIO, List, Optional
+
+from .config import Config
+
+
+class S3Client:
+    """Download the objects under one S3 key prefix into a local directory.
+
+    Every object is stored directly in ``final_dir`` under the last path
+    component of its key. If ``delete`` is set, the objects are removed
+    from the bucket after a pass in which no download failed.
+    """
+
+    def __init__(
+        self,
+        access_key: str,
+        secret_key: str,
+        bucket_name: str,
+        path: str,
+        final_dir: PathLike,
+        delete: bool = False,
+        **kwargs,
+    ):
+        """Connect to S3 and make sure the download directory exists.
+
+        Args:
+            access_key: Access key ID handed to boto3.
+            secret_key: Secret access key handed to boto3.
+            bucket_name: Name of the bucket to read from.
+            path: Key prefix selecting the objects to download.
+            final_dir: Local directory that receives the downloads.
+            delete: Whether process_files() deletes objects afterwards.
+            **kwargs: Further keyword arguments for ``boto3.client``.
+
+        Raises:
+            FileExistsError: If ``final_dir`` exists and is not a directory.
+        """
+        self.s3 = self.connect(access_key, secret_key, **kwargs)
+        self.bucket_name = bucket_name
+        self.path = path
+        self.final_dir = Path(final_dir)
+        self.delete = delete
+
+        # mkdir() itself raises FileExistsError when the path exists but is
+        # not a directory, so no assert (stripped under ``python -O``) is
+        # needed. parents=True also creates missing parent directories.
+        self.final_dir.mkdir(parents=True, exist_ok=True)
+
+    @classmethod
+    def from_config(cls, config_file: PathLike, section: str = "S3") -> "S3Client":
+        """Build a client from ``section`` of the INI file ``config_file``."""
+        config = Config(config_file, section)
+        return cls(
+            config.access_key,
+            config.secret_key,
+            config.bucket_name,
+            config.path,
+            config.final_dir,
+            config.delete,
+            **config.kwargs,
+        )
+
+    def connect(self, access_key: str, secret_key: str, **kwargs) -> BaseClient:
+        """Create the boto3 S3 client; ``kwargs`` go to ``boto3.client``."""
+        logging.debug("Connecting to S3")
+
+        s3 = boto3.client(
+            "s3",
+            aws_access_key_id=access_key,
+            aws_secret_access_key=secret_key,
+            **kwargs,
+        )
+        return s3
+
+    def list_files(self) -> List[str]:
+        """Return the keys of all objects below the configured prefix.
+
+        The listing is paginated because a single ``list_objects`` response
+        is truncated (1000 keys by default). Keys ending in "/" are left
+        out: they are folder placeholders and would otherwise be written
+        to disk as a file named after the folder.
+        """
+        logging.debug("Listing files in S3")
+
+        pages = self.s3.get_paginator("list_objects").paginate(
+            Bucket=self.bucket_name, Prefix=self.path
+        )
+        return [
+            obj["Key"]
+            for page in pages
+            for obj in page.get("Contents", [])
+            if not obj["Key"].endswith("/")
+        ]
+
+    def download_files(self, keys: Optional[List[str]] = None) -> bool:
+        """Download every object that is not yet present locally.
+
+        Args:
+            keys: Keys to download; the bucket is listed when omitted.
+
+        Returns:
+            True if every key was downloaded or skipped, False as soon as
+            listing or a download raised.
+        """
+        try:
+            logging.debug("Downloading files")
+            if keys is None:
+                keys = self.list_files()
+            for obj in keys:
+                if not self._exists_local(obj):
+                    self.download_file(obj)
+                else:
+                    logging.warning(f"File already exists locally, skipping: {obj}")
+
+        except Exception:
+            # Report through logging (with traceback) rather than print();
+            # returning False keeps process_files() from deleting anything.
+            logging.exception("Downloading files from S3 failed")
+            return False
+
+        return True
+
+    def download_file(self, filename: str) -> None:
+        """Fetch the object ``filename`` and store it in the final directory."""
+        logging.info(f"Downloading file from S3: {filename}")
+        with tempfile.TemporaryFile() as temp_file:
+            self.s3.download_fileobj(self.bucket_name, filename, temp_file)
+            temp_file.seek(0)
+            self.move_file(temp_file, filename)
+
+    def move_file(self, temp_file: BinaryIO, filename: str) -> None:
+        """Copy ``temp_file`` to ``final_dir`` under the basename of ``filename``.
+
+        The data is streamed into a ``.part`` file that is renamed once it is
+        complete, so an interrupted copy never leaves a truncated file that a
+        later run would treat as already downloaded.
+        """
+        logging.debug(f"Moving file to final directory: {filename}")
+
+        target = self.final_dir / Path(filename).name
+        partial = target.with_name(target.name + ".part")
+        try:
+            with open(partial, "wb") as final_file:
+                # Stream in chunks instead of reading the whole object.
+                shutil.copyfileobj(temp_file, final_file)
+            partial.replace(target)
+        except BaseException:
+            partial.unlink(missing_ok=True)
+            raise
+
+    def delete_files(self, keys: Optional[List[str]] = None) -> None:
+        """Delete ``keys`` from the bucket, or every listed object if omitted."""
+        logging.debug("Deleting files from S3")
+        if keys is None:
+            keys = self.list_files()
+        for obj in keys:
+            self.delete_file(obj)
+
+    def delete_file(self, filename) -> None:
+        """Delete the single object ``filename`` from the bucket."""
+        logging.info(f"Deleting file from S3: {filename}")
+        self.s3.delete_object(Bucket=self.bucket_name, Key=filename)
+
+    def process_files(self) -> None:
+        """Download all objects and, if ``delete`` is set, remove them from S3.
+
+        The bucket is listed once and the same snapshot is used for both
+        steps, so objects uploaded while the downloads are running are
+        never deleted without having been downloaded.
+        """
+        logging.debug("Processing files")
+
+        try:
+            keys = self.list_files()
+        except Exception:
+            logging.exception("Listing files in S3 failed")
+            return
+
+        if self.download_files(keys) and self.delete:
+            self.delete_files(keys)
+
+    def _exists_local(self, filename: str) -> bool:
+        """Return whether the basename of ``filename`` exists in ``final_dir``."""
+        logging.debug(f"Checking if file exists locally: {filename}")
+
+        return (self.final_dir / Path(filename).name).exists()
diff --git a/src/s3downloader/classes/config.py b/src/s3downloader/classes/config.py
new file mode 100644
index 0000000..0a4d1e1
--- /dev/null
+++ b/src/s3downloader/classes/config.py
@@ -0,0 +1,83 @@
+from configparser import ConfigParser
+
+import logging
+
+
+class Config:
+    """Settings of one s3downloader instance, read from an INI file section."""
+
+    # Options consumed by S3Client itself; every other option found in the
+    # section is passed on to boto3 through ``kwargs``.
+    _RESERVED_KEYS = frozenset(
+        {
+            "access_key",
+            "secret_key",
+            "bucket_name",
+            "path",
+            "final_dir",
+            "delete",
+        }
+    )
+
+    def __init__(self, config_file=(), section="S3"):
+        """Read ``config_file`` and remember which ``section`` to use.
+
+        Args:
+            config_file: Path, or iterable of paths, of the INI file(s).
+            section: Name of the section that holds the settings.
+
+        Raises:
+            FileNotFoundError: If files were named but none could be read.
+        """
+        logging.debug(f"Reading configuration file(s): {config_file}")
+
+        self.config = ConfigParser()
+        loaded = self.config.read(config_file)
+
+        # ConfigParser.read() silently skips unreadable files; without this
+        # check a wrong path only shows up later as KeyError(section).
+        if config_file and not loaded:
+            raise FileNotFoundError(
+                f"Could not read configuration file(s): {config_file}"
+            )
+
+        self.section = section
+
+    @property
+    def access_key(self) -> str:
+        """Access key ID (required)."""
+        return self.config[self.section]["access_key"]
+
+    @property
+    def secret_key(self) -> str:
+        """Secret access key (required)."""
+        return self.config[self.section]["secret_key"]
+
+    @property
+    def bucket_name(self) -> str:
+        """Name of the bucket (required)."""
+        return self.config[self.section]["bucket_name"]
+
+    @property
+    def path(self) -> str:
+        """Key prefix inside the bucket; empty string when not configured."""
+        return self.config[self.section].get("path", "")
+
+    @property
+    def final_dir(self) -> str:
+        """Local directory for downloaded files (required)."""
+        return self.config[self.section]["final_dir"]
+
+    @property
+    def delete(self) -> bool:
+        """Whether to delete objects after download; False when not configured."""
+        return self.config[self.section].getboolean("delete", fallback=False)
+
+    @property
+    def kwargs(self) -> dict:
+        """Remaining options of the section, as strings, for ``boto3.client``."""
+        return {
+            key: value
+            for key, value in self.config[self.section].items()
+            if key not in self._RESERVED_KEYS
+        }