Skip to content

Commit 9d8356d

Browse files
committed
Modifications to merge aws-samples#66
* Reverted license header * Removed unused code and fixed indentation * Added --database-prefix parameter for from-s3 mode in README
1 parent 705e113 commit 9d8356d

File tree

2 files changed

+6
-23
lines changed

2 files changed

+6
-23
lines changed

utilities/Hive_metastore_migration/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore.
243243
- `--database-input-path` set to the S3 path containing only databases. For example: `s3://someBucket/output_path_from_previous_job/databases`
244244
- `--table-input-path` set to the S3 path containing only tables. For example: `s3://someBucket/output_path_from_previous_job/tables`
245245
- `--partition-input-path` set to the S3 path containing only partitions. For example: `s3://someBucket/output_path_from_previous_job/partitions`
246+
- `--database-prefix` (optional) set to a string prefix that is applied to the database name created in AWS Glue Data Catalog. You can use it as a way to track the origin of the metadata, and avoid naming conflicts. The default is the empty string.
246247

247248
Also, because there is no need to connect to any JDBC source, the job doesn't
248249
require any connections.

utilities/Hive_metastore_migration/src/import_into_datacatalog.py

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,8 @@
11
# Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2-
# Licensed under the Amazon Software License (the "License"). You may not use
3-
# this file except in compliance with the License. A copy of the License is
4-
# located at
5-
#
6-
# http://aws.amazon.com/asl/
7-
#
8-
# and in the "LICENSE" file accompanying this file. This file is distributed
9-
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10-
# or implied. See the License for the specific language governing
11-
# permissions and limitations under the License.
12-
2+
# SPDX-License-Identifier: MIT-0
133

144
from __future__ import print_function
155

16-
import logging
17-
import os
18-
196
from pyspark.sql.functions import lit, struct, array, col, concat
207

218
from awsglue.context import GlueContext
@@ -24,11 +11,6 @@
2411
from hive_metastore_migration import *
2512

2613

27-
logging.basicConfig()
28-
logger = logging.getLogger(__name__)
29-
logger.setLevel(getattr(logging, os.getenv('LOG_LEVEL', 'INFO')))
30-
31-
3214
def transform_df_to_catalog_import_schema(sql_context, glue_context, df_databases, df_tables, df_partitions):
3315
df_databases_array = df_databases.select(df_databases['type'], array(df_databases['item']).alias('items'))
3416
df_tables_array = df_tables.select(df_tables['type'], df_tables['database'],
@@ -81,14 +63,14 @@ def metastore_import_from_s3(sql_context, glue_context, db_input_dir, tbl_input_
8163
databases = sql_context.read.json(path=db_input_dir, schema=METASTORE_DATABASE_SCHEMA)
8264
tables = sql_context.read.json(path=tbl_input_dir, schema=METASTORE_TABLE_SCHEMA)
8365
partitions = sql_context.read.json(path=parts_input_dir, schema=METASTORE_PARTITION_SCHEMA)
84-
85-
# Changes to Prefix on database
66+
67+
# Changes to Prefix on database
8668
if db_prefix:
8769
databases = databases.withColumn('item', struct(col('item.description'), col('item.locationUri'), concat(lit(db_prefix),col('item.name')).alias('name'), col('item.parameters')))
8870
tables = tables.withColumn("database",concat(lit(db_prefix),col('database')).alias('database'))
8971
partitions = partitions.withColumn("database",concat(lit(db_prefix),col('database')).alias('database'))
9072
partitions = partitions.withColumn('item', struct(col('item.creationTime'), col('item.creationTime'), concat(lit(db_prefix),col('item.namespaceName')).alias('namespaceName'), col('item.parameters'), col('item.storageDescriptor'), col('item.values')))
91-
73+
9274

9375
# load
9476
import_datacatalog(sql_context, glue_context, datacatalog_name, databases, tables, partitions, region)
@@ -146,7 +128,7 @@ def main():
146128
db_input_dir=options['database_input_path'],
147129
tbl_input_dir=options['table_input_path'],
148130
parts_input_dir=options['partition_input_path'],
149-
db_prefix=options.get('database_prefix') or '',
131+
db_prefix=options.get('database_prefix') or '',
150132
datacatalog_name='datacatalog',
151133
region=options.get('region') or 'us-east-1'
152134
)

0 commit comments

Comments
 (0)