Commit 4929da8

Merge pull request #39 from choinek/main
[feat] #15 Add S3 storage strategy
2 parents 4973507 + 6659b55 commit 4929da8

7 files changed, +165 -8 lines changed

.env.localhost.example

Lines changed: 7 additions & 1 deletion

@@ -15,4 +15,10 @@ CELERY_RESULT_BACKEND=redis://localhost:6379/0
 OLLAMA_HOST=http://localhost:11434
 APP_ENV=development # Default to development mode
 
-STORAGE_PROFILE_PATH=../storage_profiles
+STORAGE_PROFILE_PATH=../storage_profiles
+
+# AWS S3
+#AWS_ACCESS_KEY_ID=your-access-key-id
+#AWS_SECRET_ACCESS_KEY=your-secret-access-key
+#AWS_REGION=your-region
+#AWS_S3_BUCKET_NAME=your-bucket-name

README.md

Lines changed: 51 additions & 3 deletions

@@ -464,7 +464,7 @@ curl -X POST "http://localhost:8000/llm/generate" -H "Content-Type: application/
 
 The tool can automatically save the results using different storage strategies and storage profiles. Storage profiles are set in the `/storage_profiles` by a yaml configuration files.
 
-Example:
+### Local File System
 
 ```yaml
 strategy: local_filesystem
@@ -474,7 +474,7 @@ settings:
   create_subfolders: true
 ```
 
-for Google drive:
+### Google Drive
 
 ```yaml
 strategy: google_drive
@@ -489,8 +489,56 @@ Where the `service_account_file` is a `json` file with authorization credentials
 
 Note: Service Account is different account that the one you're using for Google workspace (files will not be visible in the UI)
 
+### Amazon S3 - Cloud Object Storage
+
+```yaml
+strategy: aws_s3
+settings:
+  bucket_name: ${AWS_S3_BUCKET_NAME}
+  region: ${AWS_REGION}
+  access_key: ${AWS_ACCESS_KEY_ID}
+  secret_access_key: ${AWS_SECRET_ACCESS_KEY}
+```
+
+#### Requirements for AWS S3 Access Key
+
+1. **Access Key Ownership**
+   The access key must belong to an IAM user or role with permissions for S3 operations.
+
+2. **IAM Policy Example**
+   The IAM policy attached to the user or role must allow the necessary actions. Below is an example of a policy granting access to an S3 bucket:
+   ```json
+   {
+     "Version": "2012-10-17",
+     "Statement": [
+       {
+         "Effect": "Allow",
+         "Action": [
+           "s3:PutObject",
+           "s3:GetObject",
+           "s3:ListBucket",
+           "s3:DeleteObject"
+         ],
+         "Resource": [
+           "arn:aws:s3:::your-bucket-name",
+           "arn:aws:s3:::your-bucket-name/*"
+         ]
+       }
+     ]
+   }
+   ```
+
+Next, populate the appropriate `.env` file (e.g., .env, .env.localhost) with the required AWS credentials:
+
+```bash
+AWS_ACCESS_KEY_ID=your-access-key-id
+AWS_SECRET_ACCESS_KEY=your-secret-access-key
+AWS_REGION=your-region
+AWS_S3_BUCKET_NAME=your-bucket-name
+```
+
 ## License
-This project is licensed under the GNU General Public License. See the [LICENSE](LICENSE.md) file for details.
+This project is licensed under the GNU General Public License. See the [LICENSE](LICENSE) file for details.
 
 **Important note on [marker](https://github.com/VikParuchuri/marker) license***:
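As an aside, the credentials wired into the `.env` file can be sanity-checked before the tool is run. Below is a minimal sketch, assuming `boto3` is installed and the four `AWS_*` variables above are exported; it simply mirrors the `head_bucket` probe the new strategy performs on startup:

```python
import os

import boto3
from botocore.exceptions import ClientError, EndpointConnectionError

# Read the same variables the s3 storage profile resolves via ${...} placeholders.
bucket = os.environ["AWS_S3_BUCKET_NAME"]

s3 = boto3.client(
    "s3",
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    region_name=os.environ["AWS_REGION"],
)

try:
    # Cheap probe: succeeds only if the key can reach the bucket in this region.
    s3.head_bucket(Bucket=bucket)
    print(f"OK: credentials can access '{bucket}'")
except (ClientError, EndpointConnectionError) as exc:
    print(f"S3 access check failed: {exc}")
```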

app/requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -17,3 +17,4 @@ google-auth-oauthlib
 transformers==4.45.2
 surya-ocr==0.4.14
 marker-pdf==0.2.6
+boto3

app/storage_manager.py

Lines changed: 5 additions & 2 deletions

@@ -2,19 +2,22 @@
 import yaml
 from storage_strategies.local_filesystem import LocalFilesystemStorageStrategy
 from storage_strategies.google_drive import GoogleDriveStorageStrategy
+from storage_strategies.aws_s3 import AWSS3StorageStrategy
 from pathlib import Path
 
 class StorageManager:
     def __init__(self, profile_name):
         profile_path = os.path.join(os.getenv('STORAGE_PROFILE_PATH', '/storage_profiles'), f'{profile_name}.yaml')
         with open(profile_path, 'r') as file:
             self.profile = yaml.safe_load(file)
-
+
         strategy = self.profile['strategy']
         if strategy == 'local_filesystem':
             self.strategy = LocalFilesystemStorageStrategy(self.profile)
         elif strategy == 'google_drive':
             self.strategy = GoogleDriveStorageStrategy(self.profile)
+        elif strategy == 'aws_s3':
+            self.strategy = AWSS3StorageStrategy(self.profile)
         else:
             raise ValueError(f"Unknown storage strategy '{strategy}'")
 
@@ -28,4 +31,4 @@ def list(self):
         return self.strategy.list()
 
     def delete(self, file_name):
-        self.strategy.delete(file_name)
+        self.strategy.delete(file_name)
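For context, a rough usage sketch of the new branch, assuming the `s3.yaml` profile added in this commit sits on `STORAGE_PROFILE_PATH` and the `AWS_*` variables are exported; only `list` and `delete` are visible in this hunk, so the sketch sticks to those:

```python
import os

from storage_manager import StorageManager

# Illustrative path; match it to wherever storage_profiles/ lives in your checkout.
os.environ.setdefault("STORAGE_PROFILE_PATH", "../storage_profiles")

manager = StorageManager("s3")      # loads storage_profiles/s3.yaml, picks AWSS3StorageStrategy
print(manager.list())               # keys currently stored in the bucket
# manager.delete("old-result.md")   # delegates to AWSS3StorageStrategy.delete
```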

app/storage_strategies/aws_s3.py

Lines changed: 81 additions & 0 deletions

@@ -0,0 +1,81 @@
+import boto3
+from botocore.exceptions import EndpointConnectionError, ClientError
+from storage_strategies.storage_strategy import StorageStrategy
+
+class AWSS3StorageStrategy(StorageStrategy):
+    def __init__(self, context):
+        super().__init__(context)
+
+        self.bucket_name = self.resolve_placeholder(context['settings'].get('bucket_name'))
+        self.region = self.resolve_placeholder(context['settings'].get('region'))
+        self.access_key = self.resolve_placeholder(context['settings'].get('access_key'))
+        self.secret_access_key = self.resolve_placeholder(context['settings'].get('secret_access_key'))
+
+        try:
+            self.s3_client = boto3.client(
+                's3',
+                aws_access_key_id=self.access_key,
+                aws_secret_access_key=self.secret_access_key,
+                region_name=self.region
+            )
+            self.s3_client.head_bucket(Bucket=self.bucket_name)
+        except EndpointConnectionError as e:
+            raise RuntimeError(
+                f"{str(e)}\n"
+                "Check your AWS_REGION and AWS_S3_BUCKET_NAME environment variables."
+            ) from e
+        except ClientError as e:
+            error_code = e.response.get('Error', {}).get('Code', 'Unknown')
+            if error_code in ('400', '403'):
+                raise RuntimeError(
+                    f"{str(e)}\n"
+                    "Error: Please check your AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY."
+                ) from e
+            raise
+
+    def save(self, file_name, dest_file_name, content):
+        formatted_file_name = self.format_file_name(file_name, dest_file_name)
+
+        try:
+            self.s3_client.put_object(
+                Bucket=self.bucket_name,
+                Key=formatted_file_name,
+                Body=content.encode('utf-8')
+            )
+        except ClientError as e:
+            raise RuntimeError(
+                f"{str(e)}\n"
+                f"Error saving file '{file_name}' as '{formatted_file_name}' to bucket '{self.bucket_name}'."
+            ) from e
+
+    def load(self, file_name):
+        try:
+            response = self.s3_client.get_object(Bucket=self.bucket_name, Key=file_name)
+            return response['Body'].read().decode('utf-8')
+        except ClientError as e:
+            error_code = e.response['Error']['Code']
+            if error_code == 'NoSuchKey':
+                return None
+            raise RuntimeError(
+                f"{str(e)}\n"
+                f"Error loading file '{file_name}' from bucket '{self.bucket_name}'."
+            ) from e
+
+    def list(self):
+        try:
+            response = self.s3_client.list_objects_v2(Bucket=self.bucket_name)
+            return [item['Key'] for item in response.get('Contents', [])]
+        except ClientError as e:
+            raise RuntimeError(
+                f"{str(e)}\n"
+                f"Error listing objects in bucket '{self.bucket_name}'."
+            ) from e
+
+    def delete(self, file_name):
+        try:
+            self.s3_client.delete_object(Bucket=self.bucket_name, Key=file_name)
+        except ClientError as e:
+            raise RuntimeError(
+                f"{str(e)}\n"
+                f"Error deleting file '{file_name}' from bucket '{self.bucket_name}'."
+            ) from e
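Taken on its own, the strategy only needs a profile-shaped dict, so a hedged sketch of direct use looks like this; it assumes the `${...}` placeholders resolve against exported environment variables (see `resolve_placeholder` in the next file) and that `{file_name}` is a supported destination placeholder, as `format_file_name` suggests:

```python
from storage_strategies.aws_s3 import AWSS3StorageStrategy

# Same shape as storage_profiles/s3.yaml after yaml.safe_load().
profile = {
    "strategy": "aws_s3",
    "settings": {
        "bucket_name": "${AWS_S3_BUCKET_NAME}",
        "region": "${AWS_REGION}",
        "access_key": "${AWS_ACCESS_KEY_ID}",
        "secret_access_key": "${AWS_SECRET_ACCESS_KEY}",
    },
}

storage = AWSS3StorageStrategy(profile)              # validates access via head_bucket()
storage.save("invoice.pdf", "{file_name}.md", "# extracted text")
print(storage.list())                                # e.g. ['invoice.md']
print(storage.load("invoice.md"))                    # -> '# extracted text'
```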

app/storage_strategies/storage_strategy.py

Lines changed: 14 additions & 2 deletions

@@ -2,6 +2,7 @@
 import yaml
 from datetime import datetime
 from pathlib import Path
+from string import Template
 
 class StorageStrategy:
     def __init__(self, context):
@@ -18,7 +19,7 @@ def list(self):
 
     def delete(self, file_name):
         raise NotImplementedError("Subclasses must implement this method")
-
+
     def format_file_name(self, file_name, format_string):
         return format_string.format(file_fullname=file_name, # file_name with path
                                     file_name=Path(file_name).stem, # file_name without path
@@ -28,4 +29,15 @@ def format_file_name(self, file_name, format_string):
                                     dd=datetime.now().strftime('%d'),
                                     HH=datetime.now().strftime('%H'),
                                     MM=datetime.now().strftime('%M'),
-                                    SS=datetime.now().strftime('%S'))
+                                    SS=datetime.now().strftime('%S'))
+
+    def resolve_placeholder(self, value, default=None):
+        if not value:
+            return default
+        try:
+            return Template(value).substitute(os.environ)
+        except KeyError as e:
+            if default:
+                return default
+            else:
+                raise ValueError(f"Environment variable '{e.args[0]}' is missing, and no default value is provided.")
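The `${VAR}` syntax in the profiles is plain `string.Template` substitution against `os.environ`; here is a tiny standalone illustration of the behaviour `resolve_placeholder` builds on (the variable values are made-up examples):

```python
import os
from string import Template

os.environ["AWS_REGION"] = "eu-central-1"  # example value for the sake of the demo

print(Template("${AWS_REGION}").substitute(os.environ))  # -> eu-central-1

try:
    # Assuming AWS_S3_BUCKET_NAME is not exported: Template raises KeyError,
    # which resolve_placeholder converts into a ValueError when no default is given.
    Template("${AWS_S3_BUCKET_NAME}").substitute(os.environ)
except KeyError as exc:
    print(f"missing environment variable: {exc.args[0]}")
```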

storage_profiles/s3.yaml

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+strategy: aws_s3
+settings:
+  bucket_name: ${AWS_S3_BUCKET_NAME}
+  region: ${AWS_REGION}
+  access_key: ${AWS_ACCESS_KEY_ID}
+  secret_access_key: ${AWS_SECRET_ACCESS_KEY}
