I need to harvest tables and column names from AWS Glue crawler metadata catalogue. I used boto3
but constantly getting number of 100 tables even though there are more. Setting up NextToken
doesn’t help. Please help if possible.
Desired results is list as follows:
lst = [table_one.col_one, table_one.col_two, table_two.col_one….table_n.col_n]
JavaScript
x
16
16
1
def harvest_aws_crawler():
2
glue = boto3.client('glue', region_name='')
3
response = glue.get_tables(DatabaseName='', NextToken = '')
4
5
#response syntax:
6
#https://boto3.amazonaws.com/v1/documentation/api/1.9.42/reference/services/glue.html#Glue.Client.get_tables
7
crawler_list_tables = []
8
9
for tables in response['TableList']:
10
while (response.get('NextToken') is not None):
11
crawler_list_tables.append(tables['Name'])
12
break
13
print(len(crawler_list_tables))
14
15
harvest_aws_crawler()
16
UPDATED code, still need to have tablename+columnname:
JavaScript
1
26
26
1
def harvest_aws_crawler():
2
glue = boto3.client('glue', region_name='')
3
next_token = ""
4
5
#response syntax:
6
#https://boto3.amazonaws.com/v1/documentation/api/1.9.42/reference/services/glue.html#Glue.Client.get_tables
7
response = glue.get_tables(DatabaseName='', NextToken = next_token)
8
9
tables_from_crawler = []
10
while True:
11
table_list = response['TableList']
12
for table_dict in table_list:
13
table_name = table_dict['Name']
14
15
#append table_name+column_name
16
for columns in table_name['StorageDescriptor']['Columns']:
17
tables_from_crawler.append(table_name + '.' + columns['Name'])
18
19
#tables_from_crawler.append(table_name)
20
next_token = response.get('NextToken')
21
if next_token is None:
22
break
23
print(tables_from_crawler)
24
25
harvest_aws_crawler()
26
Advertisement
Answer
Adding sub-loop did the trick to get table+column result.
JavaScript
1
15
15
1
#harvest aws crawler metadata
2
next_token = ""
3
client = boto3.client('glue',region_name='us-east-1')
4
crawler_tables = []
5
6
while True:
7
response = client.get_tables(DatabaseName = '', NextToken = next_token)
8
for tables in response['TableList']:
9
for columns in tables['StorageDescriptor']['Columns']:
10
crawler_tables.append(tables['Name'] + '.' + columns['Name'])
11
next_token = response.get('NextToken')
12
if next_token is None:
13
break
14
print(crawler_tables)
15