
Commit b1384a8

Author: YYC
Commit message: Extending readme and cleanup of some code

1 parent 1f13665 commit b1384a8

File tree: 4 files changed, +45 −9 lines

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -127,3 +127,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# Zip files
+.zip

README.md

Lines changed: 31 additions & 1 deletion
@@ -1,2 +1,32 @@
 # lambda-appflow-ga
-Lambda function in python to transform AWS Appflow Google Analytics data
+Lambda function in Python to transform AWS AppFlow Google Analytics data into tabular data.
+
+## Description
+This lambda is meant to be used in combination with AppFlow for Google Analytics.
+The lambda triggers every time a file is written to S3 and converts the raw JSON into a table
+structure that can be written to PostgreSQL.
+Only Google Analytics has been tested; check compatibility with other sources if needed.
+
+This solution is part of a larger blog post on Medium; the complete story is there (`https://medium.com/@yvescallaert`).
+
+
+## Requirements
+You will need the following to contribute to this project:
+* Python 3.7
+* Pip install of requirements.txt
+* Pip install boto3
+
+## Deploying
+A deployment to AWS can be done with the following command from the root of the repository:
+`./packager.sh -n test_appflow -z test_appflow -p webanalytics -r eu-central-1`
+
+Note that the script looks for an IAM role called `lambda-cli-role`; if this role is not present you will need to
+create it. The role should contain the following policies:
+* AWSLambdaBasicExecutionRole
+* AWSLambdaVPCAccessExecutionRole
+* SecretsManagerReadWrite (optional if you don't want to store your secrets in AWS Secrets Manager)
+* A custom policy that gives full access to the S3 destination folder used in AppFlow
+
+## Note
+Please note that this script has been tested with a maximum of 2 metrics and 5 dimensions. The total result during testing
+was never bigger than 2K records. If your use case exceeds 2K records you might need to alter parts of this script.
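
To make the Description above more concrete, here is a minimal, self-contained sketch of the JSON-to-table mapping the lambda performs. The sample payload and the `ga:*` field names follow the Google Analytics Reporting API v4 layout that `process_json_file` indexes into; they are illustrative assumptions, not data or code taken from this commit.

```python
# Illustrative sketch (not the repo's exact code): how one raw Google Analytics
# reporting document maps to tabular rows. Field names assume the GA Reporting
# API v4 response layout; the sample values are made up.
import pandas as pd

sample_report = {
    "reports": [{
        "columnHeader": {
            "dimensions": ["ga:date", "ga:country"],
            "metricHeader": {"metricHeaderEntries": [{"name": "ga:sessions"}]},
        },
        "data": {
            "rows": [
                {"dimensions": ["20230101", "Belgium"], "metrics": [{"values": ["42"]}]},
                {"dimensions": ["20230101", "Germany"], "metrics": [{"values": ["17"]}]},
            ]
        },
    }]
}

report = sample_report["reports"][0]
dimension_names = report["columnHeader"]["dimensions"]
metric_names = [m["name"] for m in report["columnHeader"]["metricHeader"]["metricHeaderEntries"]]

# One output row per GA row: dimension values plus metric values, keyed by header name.
records = []
for row in report["data"]["rows"]:
    record = dict(zip(dimension_names, row["dimensions"]))
    record.update(dict(zip(metric_names, row["metrics"][0]["values"])))
    records.append(record)

print(pd.DataFrame(records))
```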

appflow_ga.py

Lines changed: 7 additions & 6 deletions
@@ -12,6 +12,7 @@
 logger.removeHandler(handler)
 logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
 
+
 class PostgressDB():
     def __init__(self, username, password, port, host, database):
         self.user_name = username
@@ -45,7 +46,7 @@ def process_json_file(jsonf):
     """
     logger.info("Starting conversion JSON format to table format.")
     logger.info("Detecting {} valid JSON structures in the objects".format(str(len(jsonf))))
-    #JsonF is a list but the cols and metrics will always be the same across multiple jsons for 1 file
+    # JsonF is a list but the cols and metrics will always be the same across multiple jsons for 1 file
     cols = []
     try:
         cols = [r for r in jsonf[0]['reports'][0]['columnHeader']['dimensions']]
@@ -57,15 +58,14 @@ def process_json_file(jsonf):
     except:
         logger.warning("No metrics specified.")
 
-
     pd_result = None
 
     for list_index in range(len(jsonf)):
         data_rows = [r for r in jsonf[list_index]['reports'][0]['data']['rows']]
         dim_result_dict = {}
 
         for row in data_rows:
-            #if there are dimensions, extract the dimension data and add values per key
+            # if there are dimensions, extract the dimension data and add values per key
             for i in range(len(cols)):
                 if cols[i] in dim_result_dict.keys():
                     data_list = dim_result_dict[cols[i]]
@@ -82,25 +82,26 @@ def process_json_file(jsonf):
                     dim_result_dict.update({metrics[i]: data_list})
                 else:
                     dim_result_dict[metrics[i]] = [row['metrics'][0]['values'][i]]
-        #Create dataframe for the first JSON object otherwise append to existing
+        # Create dataframe for the first JSON object otherwise append to existing
        if list_index == 0:
             pd_result = pd.DataFrame.from_dict(dim_result_dict)
         else:
             pd_result = pd_result.append(pd.DataFrame.from_dict(dim_result_dict))
     logger.info("Finished conversion JSON format to table format.")
     return pd_result
 
+
 def lambda_handler(event, context):
     logger.info("Starting appflow conversion")
     bucket_name = event['Records'][0]['s3']['bucket']['name']
     object_key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'])
     s3_client = boto3.client('s3')
 
-    logger.info("Processing bucket {}, filename {}".format(bucket_name,object_key))
+    logger.info("Processing bucket {}, filename {}".format(bucket_name, object_key))
 
     raw_object = s3_client.get_object(Bucket=bucket_name, Key=object_key)
     raw_data = json.loads('[' + raw_object['Body'].read().decode('utf-8').replace('}\n{', '},{') + ']')
-    #Raw data is always a list of JSON objects
+    # Raw data is always a list of JSON objects
     pd_result = process_json_file(raw_data)
 
     db = PostgressDB(username=os.getenv("DB_USERNAME"),
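
The `raw_data = json.loads('[' + ... + ']')` line above relies on AppFlow writing one JSON document per line of the S3 object. A small standalone check of that concatenation trick, assuming the documents are separated by a single newline as the replace pattern implies:

```python
# Standalone check of the concatenation trick used in lambda_handler: wrap the
# newline-delimited documents in '[...]' and join them with commas so the whole
# body parses as one JSON array.
import json

body = '{"reports": [{"data": {"rows": []}}]}\n{"reports": [{"data": {"rows": []}}]}'
raw_data = json.loads('[' + body.replace('}\n{', '},{') + ']')

assert isinstance(raw_data, list) and len(raw_data) == 2
print(len(raw_data), "JSON documents parsed")
```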

packager.sh

Lines changed: 4 additions & 2 deletions
@@ -7,6 +7,9 @@ nFlag=0
 zFlag=0
 
 CreateZipFile(){
+rm -rf package
+mkdir -p package
+pip install -r requirements.txt --target package/
 cd package
 zip -r9 ../${ZIP_NAME}.zip .
 cd ../
@@ -15,7 +18,7 @@ zip -g ${ZIP_NAME}.zip appflow_ga.py
 
 
 UsageMessage(){
-echo "usage : packager -p <aws profile> -r <aws_region> -n <name of the function> -z <zipname> -h <dbhost> -i <GOOGLE_SPREADSHEET_ID> -t <GOOGLE_SPREADSHEET_TAB_PAGE> -d <DB_TEMP_TABLE>"
+echo "usage : packager -p <aws profile> -r <aws_region> -n <name of the function> -z <zipname>"
 }
 
 while getopts "p:r:n:z:" option
@@ -48,7 +51,6 @@ fi
 CreateZipFile
 
 
-
 function_name_cli=`aws lambda get-function --function-name ${FUNCTION_NAME} --profile ${AWS_PROFILE} --region=${AWS_REGION} | jq -r '.Configuration | .FunctionName'`
 
 if [ ${function_name_cli} == ${FUNCTION_NAME} ]; then
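
Separately, the README's Deploying section notes that `packager.sh` expects an existing IAM role named `lambda-cli-role`. The boto3 sketch below is one possible way to bootstrap that role; the managed-policy ARNs are standard AWS values, while the trust policy, inline policy name, and bucket name are placeholder assumptions to adapt to your own AppFlow destination.

```python
# Hedged sketch: create the lambda-cli-role the README mentions. The managed
# policy ARNs are AWS-provided; the bucket name and policy name are placeholders.
import json
import boto3

iam = boto3.client("iam")

trust_policy = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "lambda.amazonaws.com"},
        "Action": "sts:AssumeRole",
    }],
}

iam.create_role(
    RoleName="lambda-cli-role",
    AssumeRolePolicyDocument=json.dumps(trust_policy),
)

for arn in [
    "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole",
    "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole",
    "arn:aws:iam::aws:policy/SecretsManagerReadWrite",  # optional, see README
]:
    iam.attach_role_policy(RoleName="lambda-cli-role", PolicyArn=arn)

# Placeholder inline policy for the AppFlow destination bucket (adjust the name).
s3_policy = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Action": "s3:*",
        "Resource": [
            "arn:aws:s3:::your-appflow-destination-bucket",
            "arn:aws:s3:::your-appflow-destination-bucket/*",
        ],
    }],
}
iam.put_role_policy(
    RoleName="lambda-cli-role",
    PolicyName="appflow-destination-s3-access",
    PolicyDocument=json.dumps(s3_policy),
)
```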
