I have 74 relatively large Pandas DataFrames (about 34,600 rows and 8 columns each) that I am trying to insert into a SQL Server database as quickly as possible. After doing some research, I learned that the good ole pandas.to_sql function is not good for such large inserts into a SQL Server database. That was the initial approach I took, and it was very slow: the application took almost an hour to complete, versus about 4 minutes when using a MySQL database.
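For context, the initial approach was just a plain to_sql call along these lines (the table name and engine URL here are illustrative, not my real ones):

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mssql+pyodbc://user:pass@my_dsn')
df = pd.DataFrame({'result_id': [1, 2], 'data_value': [0.5, 0.7]})

# Issues row-by-row INSERTs under the hood, which is what made this so slow.
df.to_sql('time_series_result_values', engine, if_exists='append', index=False)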
This article, and many other StackOverflow posts, have been helpful in pointing me in the right direction; however, I've hit a roadblock:
I am trying to use SQLAlchemy's Core rather than the ORM, for reasons explained in the link above. So, I am converting the DataFrame to a dictionary using pandas.to_dict and then executing an insert():
self._session_factory.engine.execute(
    TimeSeriesResultValues.__table__.insert(),
    data)  # 'data' is a list of dictionaries
The problem is that the insert is not getting any values; they show up as a bunch of empty parentheses, and I get this error:
(pyodbc.IntegrityError) ('23000', "[23000] [FreeTDS][SQL Server]Cannot insert the value NULL into the column...
There are values in the list of dictionaries that I passed in, so I can’t figure out why the values aren’t showing up.
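For reference, here is a minimal, self-contained sketch of the pattern I am attempting (the engine URL, table, and column names are made up for illustration; my real table is TimeSeriesResultValues):

import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Float

engine = create_engine('mssql+pyodbc://user:pass@my_dsn')
metadata = MetaData()

# Stand-in for the real TimeSeriesResultValues table.
values_table = Table('time_series_result_values', metadata,
                     Column('result_id', Integer),
                     Column('data_value', Float))

df = pd.DataFrame({'result_id': [1, 2], 'data_value': [0.5, 0.7]})

# orient='records' produces one dict per row, the shape that
# SQLAlchemy Core expects for an executemany-style insert.
data = df.to_dict(orient='records')
engine.execute(values_table.insert(), data)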
EDIT:
Here’s the example I’m going off of:
def test_sqlalchemy_core(n=100000):
    init_sqlalchemy()
    t0 = time.time()
    engine.execute(
        Customer.__table__.insert(),
        [{"name": 'NAME ' + str(i)} for i in range(n)]
    )
    print("SQLAlchemy Core: Total time for " + str(n) +
          " records " + str(time.time() - t0) + " secs")
Answer
I've got some sad news for you: SQLAlchemy doesn't actually implement bulk inserts for SQL Server; it just issues the same slow, individual INSERT statements that to_sql does. I would say your best bet is to script something up using the bcp command-line tool. Here is a script that I've used in the past, but no guarantees:
from subprocess import call
import pandas as pd
import numpy as np

pad = 0.1
tablename = 'sandbox.max.pybcp_test'
overwrite = True
raise_exception = True
server = 'P01'
trusted_connection = True
username = None
password = None
delimiter = '|'

df = pd.read_csv('D:/inputdata.csv', encoding='latin', error_bad_lines=False)


def get_column_def_sql(col):
    if col.dtype == object:
        width = col.str.len().max() * (1 + pad)
        return '[{}] varchar({})'.format(col.name, int(width))
    elif np.issubdtype(col.dtype, np.floating):
        return '[{}] float'.format(col.name)
    elif np.issubdtype(col.dtype, np.integer):
        return '[{}] int'.format(col.name)
    else:
        if raise_exception:
            raise NotImplementedError('data type {} not implemented'.format(col.dtype))
        else:
            print('Warning: cast column {} as varchar; data type {} not implemented'.format(col.name, col.dtype))
            width = col.str.len().max() * (1 + pad)
            return '[{}] varchar({})'.format(col.name, int(width))


def create_table(df, tablename, server, trusted_connection, username, password, pad):
    if trusted_connection:
        login_string = '-E'
    else:
        login_string = '-U {} -P {}'.format(username, password)

    col_defs = [get_column_def_sql(df[col]) for col in df]
    query_string = 'CREATE TABLE {}\n({})\nGO\nQUIT'.format(tablename, ',\n'.join(col_defs))
    if overwrite:
        query_string = "IF OBJECT_ID('{}', 'U') IS NOT NULL DROP TABLE {};".format(tablename, tablename) + query_string

    query_file = 'c:\\pybcp_tempqueryfile.sql'
    with open(query_file, 'w') as f:
        f.write(query_string)

    o = call('sqlcmd -S {} {} -i {}'.format(server, login_string, query_file), shell=True)
    if o != 0:
        raise BaseException("Failed to create table")
    # o = call('del {}'.format(query_file), shell=True)


def call_bcp(df, tablename):
    if trusted_connection:
        login_string = '-T'
    else:
        login_string = '-U {} -P {}'.format(username, password)
    temp_file = 'c:\\pybcp_tempqueryfile.csv'

    # Strip the delimiter out of the data and re-encode as latin so
    # SQL Server can read the file.
    df.loc[:, df.dtypes == object] = df.loc[:, df.dtypes == object].apply(
        lambda col: col.str.replace(delimiter, '', regex=False).str.encode('latin'))
    df.to_csv(temp_file, index=False, sep=delimiter, errors='ignore')

    # -t^| sets the field terminator to | (the ^ escapes the pipe for cmd.exe);
    # -r\n sets the row terminator.
    o = call('bcp {} in {} -S {} {} -t^| -r\\n -c'.format(
        tablename, temp_file, server, login_string), shell=True)
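For what it's worth, I'd drive it roughly like this (the server, table name, and CSV path are placeholders you'd swap for your own):

# Hypothetical invocation of the helpers above; adjust the globals
# (server, tablename, trusted_connection, etc.) for your environment.
df = pd.read_csv('D:/inputdata.csv', encoding='latin')
create_table(df, tablename, server, trusted_connection, username, password, pad)
call_bcp(df, tablename)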