import json
import logging
import os
import urllib.parse

import boto3
import pandas as pd
from sqlalchemy import create_engine

logger = logging.getLogger()
# Drop any pre-installed handlers (the Lambda runtime adds one) so that
# basicConfig takes effect; iterate over a copy because removeHandler
# mutates the handler list.
for handler in list(logger.handlers):
    logger.removeHandler(handler)
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

class PostgresDB:
    def __init__(self, username, password, port, host, database):
        self.user_name = username
        self.password = password
        self.port = port
        self.host = host
        self.database = database
        self.conn_string = self.__get_connect_string()

    def __get_connect_string(self):
        """
        Build the connection string for Postgres.
        :return: str: a valid connection string for a SQLAlchemy engine
        """
        # quote_plus guards against special characters in the credentials
        return 'postgresql://{}:{}@{}:{}/{}'.format(
            urllib.parse.quote_plus(self.user_name),
            urllib.parse.quote_plus(self.password),
            self.host, self.port, self.database)

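    # __get_connect_string yields a URL of this shape (illustrative values):
    #   postgresql://ga_loader:s3cret@db.example.com:5432/analytics
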
    def create_engine(self):
        """
        Return a SQLAlchemy engine with connection pooling.
        :return: sqlalchemy.engine.Engine
        """
        return create_engine(self.conn_string, pool_size=20, max_overflow=0)


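# Pooling note (an aside): a Lambda container typically serves one request at
# a time, so the pool above is rarely exercised. SQLAlchemy's NullPool is a
# common alternative for short-lived environments, e.g.:
#
#   from sqlalchemy.pool import NullPool
#   engine = create_engine(conn_string, poolclass=NullPool)
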
def process_json_file(jsonf):
    """
    Process the result of an AppFlow Google Analytics extraction. The JSON
    structure needs to be converted to a table structure.
    :param jsonf: list of parsed JSON structures
    :return: Pandas DataFrame
    """
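    # Assumed input shape (an assumption based on the Google Analytics
    # Reporting API v4 response format that the code below indexes into;
    # values are illustrative):
    #
    #   [{"reports": [{
    #       "columnHeader": {
    #           "dimensions": ["ga:date"],
    #           "metricHeader": {"metricHeaderEntries": [{"name": "ga:sessions"}]}},
    #       "data": {"rows": [
    #           {"dimensions": ["20230101"], "metrics": [{"values": ["42"]}]}]}}]}]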
    logger.info("Starting conversion JSON format to table format.")
    logger.info("Detected {} valid JSON structures in the objects".format(len(jsonf)))
    # jsonf is a list, but the columns and metrics are always the same across
    # the JSON objects of one file, so read the headers from the first object.
    cols = []
    try:
        cols = jsonf[0]['reports'][0]['columnHeader']['dimensions']
    except (KeyError, IndexError):
        logger.warning("No dimensions specified.")
    metrics = []
    try:
        metrics = [r['name'] for r in jsonf[0]['reports'][0]['columnHeader']['metricHeader']['metricHeaderEntries']]
    except (KeyError, IndexError):
        logger.warning("No metrics specified.")

    frames = []

    for json_obj in jsonf:
        data_rows = json_obj['reports'][0]['data']['rows']
        dim_result_dict = {}

        for row in data_rows:
            # If there are dimensions, collect the dimension values per column name
            for i, col in enumerate(cols):
                dim_result_dict.setdefault(col, []).append(row['dimensions'][i])

            # If there are metrics, collect the metric values per metric name
            for i, metric in enumerate(metrics):
                dim_result_dict.setdefault(metric, []).append(row['metrics'][0]['values'][i])

        # One DataFrame per JSON object; they are combined below
        frames.append(pd.DataFrame.from_dict(dim_result_dict))

    # DataFrame.append was removed in pandas 2.x, so combine with pd.concat
    pd_result = pd.concat(frames, ignore_index=True) if frames else None
    logger.info("Finished conversion JSON format to table format.")
    return pd_result


def lambda_handler(event, context):
    logger.info("Starting appflow conversion")
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    object_key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'])
    s3_client = boto3.client('s3')

    logger.info("Processing bucket {}, filename {}".format(bucket_name, object_key))

    raw_object = s3_client.get_object(Bucket=bucket_name, Key=object_key)
    # The object contains newline-delimited JSON; wrap it in brackets and add
    # commas so the whole payload parses as one JSON array.
    raw_data = json.loads('[' + raw_object['Body'].read().decode('utf-8').replace('}\n{', '},{') + ']')
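    # For example (hypothetical payload), two newline-delimited objects
    #   {"reports": [...]}\n{"reports": [...]}
    # become the parseable array
    #   [{"reports": [...]},{"reports": [...]}]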
    # Raw data is always a list of JSON objects
    pd_result = process_json_file(raw_data)

    db = PostgresDB(username=os.getenv("DB_USERNAME"),
                    password=os.getenv("DB_PASSWORD"),
                    port=5432,
                    host=os.getenv("DB_HOST"),
                    database=os.getenv("DB_DATABASE"))
    db_tmp_table = os.getenv("DB_TABLE_TMP")
    logger.info("Writing data to the table {}".format(db_tmp_table))

    pd_result.to_sql(name=db_tmp_table, con=db.create_engine(), index=False, if_exists='replace')

    logger.info("Finished appflow conversion")
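

# Minimal local-test sketch (assumes valid AWS credentials and the DB_*
# environment variables are set; the bucket and key below are hypothetical):
if __name__ == '__main__':
    sample_event = {'Records': [{'s3': {
        'bucket': {'name': 'my-appflow-bucket'},
        'object': {'key': 'ga-flow/output.json'}}}]}
    lambda_handler(sample_event, None)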