I need to harvest table and column names from the AWS Glue crawler metadata catalogue. I am using boto3,
but I always get exactly 100 tables back even though the database contains more. Setting NextToken
doesn't help. Any help would be appreciated.
The desired result is a list like the following:
lst = [table_one.col_one, table_one.col_two, table_two.col_one, ..., table_n.col_n]
def harvest_aws_crawler():
    """Print the number of tables in a Glue Data Catalog database.

    A single ``get_tables`` call only returns one page of tables, so the
    request is repeated, passing back the ``NextToken`` from each response,
    until a response arrives without a token.
    """
    # region_name and DatabaseName are left blank here; fill them in before running.
    glue = boto3.client('glue', region_name='')
    # response syntax:
    # https://boto3.amazonaws.com/v1/documentation/api/1.9.42/reference/services/glue.html#Glue.Client.get_tables
    crawler_list_tables = []
    # NextToken is only added once the service has actually handed one back.
    request = {'DatabaseName': ''}
    while True:
        response = glue.get_tables(**request)
        crawler_list_tables.extend(
            table['Name'] for table in response['TableList']
        )
        next_token = response.get('NextToken')
        if not next_token:
            # No continuation token: this was the last page.
            break
        request['NextToken'] = next_token
    print(len(crawler_list_tables))


harvest_aws_crawler()
Updated code; I still need each entry in the form table_name.column_name:
def harvest_aws_crawler():
    """Print every ``table_name.column_name`` pair in a Glue database.

    Each page returned by ``get_tables`` is walked table by table and column
    by column; the next page is requested with the ``NextToken`` from the
    previous response until no token is returned.
    """
    # region_name and DatabaseName are left blank here; fill them in before running.
    glue = boto3.client('glue', region_name='')
    # response syntax:
    # https://boto3.amazonaws.com/v1/documentation/api/1.9.42/reference/services/glue.html#Glue.Client.get_tables
    tables_from_crawler = []
    # NextToken is only added once the service has actually handed one back.
    request = {'DatabaseName': ''}
    while True:
        # The request must be re-issued on every iteration; reusing the first
        # response would keep returning the same token forever.
        response = glue.get_tables(**request)
        for table_dict in response['TableList']:
            table_name = table_dict['Name']
            # Columns live on the table dict, not on its name string.
            # A table without a StorageDescriptor contributes no entries.
            columns = table_dict.get('StorageDescriptor', {}).get('Columns', [])
            for column in columns:
                tables_from_crawler.append(table_name + '.' + column['Name'])
        next_token = response.get('NextToken')
        if not next_token:
            break
        request['NextToken'] = next_token
    print(tables_from_crawler)


harvest_aws_crawler()
Advertisement
Answer
Adding a nested loop over each table's columns did the trick and produced the table + column result.
# Harvest AWS Glue crawler metadata: build a list of "table.column" strings
# for every table in the database, following NextToken across all pages.
next_token = ""
client = boto3.client('glue', region_name='us-east-1')
crawler_tables = []
while True:
    # DatabaseName is left blank here; fill it in before running.
    request = {'DatabaseName': ''}
    if next_token:
        # Only send a continuation token once the service has returned one.
        request['NextToken'] = next_token
    response = client.get_tables(**request)
    for table in response['TableList']:
        # A table without a StorageDescriptor contributes no entries
        # instead of raising KeyError.
        columns = table.get('StorageDescriptor', {}).get('Columns', [])
        for column in columns:
            crawler_tables.append(table['Name'] + '.' + column['Name'])
    next_token = response.get('NextToken')
    if not next_token:
        # A missing or empty token means the last page has been read.
        break
print(crawler_tables)