Compare CSV files content with filecmp and ignore metadata

Question

I want to compare all CSV files kept on my local machine to files kept on a server. The folder structure is the same for both of them. I only want to do a data comparison and not a metadata comparison (time of creation, etc.). I am using filecmp, but it seems to perform a metadata comparison. Is there a way to compare only the file contents and ignore the metadata?

Accepted Answer

There are multiple ways to compare the .csv files between the 2 repositories (server file system and local file system).

Method 1: using hashlib

This method uses the Python module hashlib. I used the hashing algorithm sha256 to compute the hash digest for the files. I compare the hashes of files that have exactly the same file name. This method works well, but it will overlook any file that doesn't exist in both directories.

```python
import hashlib


def compare_common_files_by_hash(directory_one, directory_two):
    d1_files = set(os.listdir(directory_one))
    d2_files = set(os.listdir(directory_two))
    common_files = list(d1_files &  d2_files)
    if common_files:
        for filename in common_files:
            hash_01 = hashlib.sha256(open(f'{directory_one}/{filename}', 'rb').read()).hexdigest()
            hash_02 = hashlib.sha256(open(f'{directory_two}/{filename}', 'rb').read()).hexdigest()
            if hash_01 == hash_02:
                print(f'The file - {filename} is identical in the directories {directory_one} and {directory_two}')
            elif hash_01 != hash_02:
                print(f'The file - {filename} is different in the directories {directory_one} and {directory_two}')
```

Method 2: using os st_size

This method uses the Python module os. In this example, I compared the size of files.
This method works OK, but it will misclassify any file that has a data change that doesn't change the size of the file.

```python
import os


def compare_common_files_by_size(directory_one, directory_two):
    d1_files = set(os.listdir(directory_one))
    d2_files = set(os.listdir(directory_two))
    common_files = list(d1_files &  d2_files)
    if common_files:
        for filename in common_files:
            file_01 = os.stat(f'{directory_one}/{filename}')
            file_02 = os.stat(f'{directory_two}/{filename}')
            if file_01.st_size == file_02.st_size:
                print(f'The file - {filename} is identical in the directories {directory_one} and {directory_two}')
            elif file_01.st_size != file_02.st_size:
                print(f'The file - {filename} is different in the directories {directory_one} and'
                      f' {directory_two}')
```

Method 3: using os st_size and st_mtime

This method also uses the Python module os. In this example, I compared not only the size of the file, but also the last modification time. This method works well, but it can misclassify files whose data has not changed as being different.
In testing, I saved a file with no data modifications and the st_mtime comparison flagged the file as being different, but in reality its content wasn't different.

```python
import os


def compare_common_files_by_metadata(directory_one, directory_two):
    d1_files = set(os.listdir(directory_one))
    d2_files = set(os.listdir(directory_two))
    common_files = list(d1_files & d2_files)
    if common_files:
        for filename in common_files:
            file_01 = os.stat(f'{directory_one}/{filename}')
            file_02 = os.stat(f'{directory_two}/{filename}')
            if file_01.st_size == file_02.st_size and file_01.st_mtime == file_02.st_mtime:
                print(f'The file - {filename} is identical in the directories {directory_one} and {directory_two}')
            elif file_01.st_size != file_02.st_size or file_01.st_mtime != file_02.st_mtime:
                print(f'The file - {filename} is different in the directories {directory_one} and'
                      f' {directory_two}')
```

Method 4: using set()

This example uses Python set() to determine the line-to-line differences between 2 csv files with the same name.
This method will output the exact changes between the 2 csv files.

```python
import os


def compare_common_files_by_lines(directory_one, directory_two):
    d1_files = set(os.listdir(directory_one))
    d2_files = set(os.listdir(directory_two))
    common_files = list(d1_files & d2_files)
    if common_files:
        for filename in common_files:
            if filename.endswith('.csv'):
                file_01 = open(f'{directory_one}/{filename}', 'r', encoding='ISO-8859-1')
                file_02 = open(f'{directory_two}/{filename}', 'r', encoding='ISO-8859-1')
                csv_file_01 = set(map(tuple, csv.reader(file_01)))
                csv_file_02 = set(map(tuple, csv.reader(file_02)))
                different = csv_file_01 ^ csv_file_02
                for row in sorted(different, key=lambda x: x, reverse=True):
                    if row:
                        print(f'This row: \n {row} \n was different between the file {filename} in the directories'
                              f' {directory_one} and {directory_two}')
```

Method 5: using filecmp.cmp

This method uses the Python module filecmp. In this example I used filecmp.cmp with shallow set to False. Setting this parameter to False instructs filecmp to compare the contents of the files rather than their metadata (such as file size); comparing metadata is the default behavior of filecmp.cmp.
This method works as well as Method 1, which used hashlib.

```python
import filecmp


def compare_common_files(directory_one, directory_two):
    d1_files = set(os.listdir(directory_one))
    d2_files = set(os.listdir(directory_two))
    common_files = list(d1_files & d2_files)
    if common_files:
        for filename in common_files:
            file_01 = f'{directory_one}/{filename}'
            file_02 = f'{directory_two}/{filename}'
            comparison = filecmp.cmp(file_01, file_02, shallow=False)
            if comparison:
                print(f'The file - {filename} is identical in the directories - {directory_one} and {directory_two}')
            elif not comparison:
                print(f'The file - {filename} is different in the directories - {directory_one} and {directory_two}')
```

Method 6: using filecmp.dircmp

This method also uses the Python module filecmp. In this example I used filecmp.dircmp, which allows me not only to identify files that are not common to the 2 directories, but also to find those files that have the same name but different content.

```python
import filecmp


def directory_recursive(directory_one, directory_two):
    files = filecmp.dircmp(directory_one, directory_two)
    for filename in files.diff_files:
        print(f'The file - {filename} is different in the directories - {files.left} and {files.right}')
    for filename in files.left_only:
        print(f'The file - {filename} - was only found in the directory {files.left}')
    for filename in files.right_only:
        print(f'The file - {filename} - was only found in the directory {files.right}')
```

Method 7: line-by-line comparison

This example does a line-by-line comparison of 2 csv files and outputs the lines that are different.
The output can be added to either a Python dictionary or a JSON file for secondary processing.

```python
import csv


def get_csv_file_lines(file):
    with open(file, 'r', encoding='utf-8') as csv_file:
        rows = csv.reader(csv_file)
        for row in rows:
            yield row


def compare_csv_files_line_by_line(csv_file_one, csv_file_two):
    csvfile_02 = get_csv_file_lines(csv_file_two)
    for line_one in get_csv_file_lines(csv_file_one):
        line_two = csvfile_02.__next__()
        if line_two != line_one:
            print('File names being compared:')
            print(f'csv_file_one: {csv_file_one}')
            print(f'csv_file_two: {csv_file_two}')
            print(f'The following rows have difference in the files being compared.')
            print('csv_file_one:', line_one)
            print('csv_file_two:', line_two)
            print('\n')
```

Local file system to S3 bucket using hashlib

The example below is a real-world use case for comparing files between a local file system and a remote S3 bucket. I was originally going to use the object.e_tag that AWS S3 creates, but that tag can have issues and shouldn't be used in a hashing comparison operation. I decided to query S3 and load an individual file into a memory file system that could be queried and emptied during each comparison operation.
This method worked very well and had no adverse impact on my system performance.

```python
import fs
import os
import boto3
import hashlib


def create_temp_memory_filesystem():
    mem_fs = fs.open_fs('mem://')
    virtual_disk = mem_fs.makedir('hidden_dir')
    return mem_fs, virtual_disk


def query_s3_file_by_name(filename, memory_filesystem, temp_directory):
    s3 = boto3.resource('s3', aws_access_key_id='your_access_key_id',
                        aws_secret_access_key='your_secret_access_key')
    bucket = s3.Bucket('your_bucket_name')
    for obj in bucket.objects.all():
        if obj.key == filename:
            body = obj.get()['Body'].read()
            with memory_filesystem.open(f'{temp_directory}/s3_{filename}', 'w') as f:
                f.write(str(body))
                f.close()


def compare_local_files_to_s3_files(local_csv_files):
    virtual_disk = create_temp_memory_filesystem()
    directory_name = str(virtual_disk[1]).split('/')[1]
    files = set(os.listdir(local_csv_files))
    for filename in files:
        if filename.endswith('.csv'):
            local_file_hash = hashlib.sha256(open(f'{local_csv_files}/{filename}', 'rb').read()).hexdigest()
            query_s3_file_by_name(filename, virtual_disk[0], directory_name)
            virtual_files = virtual_disk[0].opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                s3_file_hash = hashlib.sha256(open(file_name, 'rb').read()).hexdigest()
                if local_file_hash == s3_file_hash:
                    print(f'The file - {filename} is identical in both the local file system and the S3 bucket.')
                elif local_file_hash != s3_file_hash:
                    print(f'The file - {filename} is different between the local file system and the S3 bucket.')
                virtual_files.remove(file_name)
    virtual_disk[0].close()
```

Local file system to S3 bucket using filecmp

This example is the same as the one above, except that I use filecmp.cmp instead of hashlib for the comparison operation.

```python
import fs
import os
import boto3
import filecmp


def create_temp_memory_filesystem():
    mem_fs = fs.open_fs('mem://')
    virtual_disk = mem_fs.makedir('hidden_dir')
    return mem_fs, virtual_disk


def query_s3_file_by_name(filename, memory_filesystem, temp_directory):
    s3 = boto3.resource('s3', aws_access_key_id='your_access_key_id',
                        aws_secret_access_key='your_secret_access_key')
    bucket = s3.Bucket('your_bucket_name')
    for obj in bucket.objects.all():
        if obj.key == filename:
            body = obj.get()['Body'].read()
            with memory_filesystem.open(f'{temp_directory}/s3_{filename}', 'w') as f:
                f.write(str(body))
                f.close()


def compare_local_files_to_s3_files(local_csv_files):
    virtual_disk = create_temp_memory_filesystem()
    directory_name = str(virtual_disk[1]).split('/')[1]
    files = set(os.listdir(local_csv_files))
    for filename in files:
        if filename.endswith('.csv'):
            local_file = f'{local_csv_files}/{filename}'
            query_s3_file_by_name(filename, virtual_disk[0], directory_name)
            virtual_files = virtual_disk[0].opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                comparison = filecmp.cmp(local_file, file_name, shallow=False)
                if comparison:
                    print(f'The file - {filename} is identical in both the local file system and the S3 bucket.')
                elif not comparison:
                    print(f'The file - {filename} is different between the local file system and the S3 bucket.')
                virtual_files.remove(file_name)
    virtual_disk[0].close()
```

Local file system to Google Cloud storage bucket using hashlib

This example is similar to the S3 hashlib code example above, but it uses a Google Cloud storage bucket.

```python
import fs
import os
import hashlib
from google.cloud import storage


def create_temp_memory_filesystem():
    mem_fs = fs.open_fs('mem://')
    virtual_disk = mem_fs.makedir('hidden_dir')
    return mem_fs, virtual_disk


def query_google_cloud_storage_file_by_name(filename, memory_filesystem, temp_directory):
    client = storage.Client.from_service_account_json('path_to_your_credentials.json')
    bucket = client.get_bucket('your_bucket_name')
    blobs = bucket.list_blobs()
    for blob in blobs:
        if blob.name == filename:
            with memory_filesystem.open(f'{temp_directory}/{filename}', 'w') as f:
                f.write(str(blob.download_to_filename(blob.name)))
                f.close()


def compare_local_files_to_google_storage_files(local_csv_files):
    virtual_disk = create_temp_memory_filesystem()
    directory_name = str(virtual_disk[1]).split('/')[1]
    files = set(os.listdir(local_csv_files))
    for filename in files:
        if filename.endswith('.csv'):
            local_file_hash = hashlib.sha256(open(f'{local_csv_files}/{filename}', 'rb').read()).hexdigest()
            query_google_cloud_storage_file_by_name(filename, virtual_disk[0], directory_name)
            virtual_files = virtual_disk[0].opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                gs_file_hash = hashlib.sha256(open(file_name, 'rb').read()).hexdigest()
                if local_file_hash == gs_file_hash:
                    print(f'The file - {filename} is identical in both the local file system and the Google Cloud bucket.')
                elif local_file_hash != gs_file_hash:
                    print(f'The file - {filename} is different between the local file system and the Google Cloud bucket.')
                virtual_files.remove(file_name)
    virtual_disk[0].close()
```

Local file system to Google Cloud storage bucket using filecmp

This example is similar to the S3 filecmp code example above, but it uses a Google Cloud storage bucket.
```python
import fs
import os
import filecmp
from google.cloud import storage


def create_temp_memory_filesystem():
    mem_fs = fs.open_fs('mem://')
    virtual_disk = mem_fs.makedir('hidden_dir')
    return mem_fs, virtual_disk


def query_google_cloud_storage_file_by_name(filename, memory_filesystem, temp_directory):
    client = storage.Client.from_service_account_json('path_to_your_credentials.json')
    bucket = client.get_bucket('your_bucket_name')
    blobs = bucket.list_blobs()
    for blob in blobs:
        if blob.name == filename:
            with memory_filesystem.open(f'{temp_directory}/{filename}', 'w') as f:
                f.write(str(blob.download_to_filename(blob.name)))
                f.close()


def compare_local_files_to_google_storage_files(local_csv_files):
    virtual_disk = create_temp_memory_filesystem()
    directory_name = str(virtual_disk[1]).split('/')[1]
    files = set(os.listdir(local_csv_files))
    for filename in files:
        if filename.endswith('.csv'):
            local_file = f'{local_csv_files}/{filename}'
            query_google_cloud_storage_file_by_name(filename, virtual_disk[0], directory_name)
            virtual_files = virtual_disk[0].opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                comparison = filecmp.cmp(local_file, file_name, shallow=False)
                if comparison:
                    print(f'The file - {filename} is identical in both the local file system and the Google Cloud bucket.')
                elif not comparison:
                    print(f'The file - {filename} is different between the local file system and the Google Cloud bucket.')
                virtual_files.remove(file_name)
    virtual_disk[0].close()
```

Advertisement

Answer