I want to compare all CSV files kept on my local machine to files kept on a server. The folder structure is the same for both of them. I only want to do a data comparison and not a metadata comparison (like time of creation, etc.). I am using filecmp:

import filecmp

comparison = filecmp.dircmp(dir_local, dir_server)
comparison.report_full_closure()

but it seems to perform a metadata comparison. Is there a way to do what I want?
Answer
There are multiple ways to compare the .csv files between the two locations (the server file system and the local file system).
Method 1: using hashlib
This method uses the Python module hashlib. I used the sha256 hashing algorithm to compute a hash digest for each file and compared the digests of files that share the same name. This method works well, but it will overlook any file that doesn't exist in both directories.
import hashlib
import os

def compare_common_files_by_hash(directory_one, directory_two):
    d1_files = set(os.listdir(directory_one))
    d2_files = set(os.listdir(directory_two))
    common_files = list(d1_files & d2_files)
    if common_files:
        for filename in common_files:
            # Hash the raw bytes of each file and compare the digests.
            hash_01 = hashlib.sha256(open(f'{directory_one}/{filename}', 'rb').read()).hexdigest()
            hash_02 = hashlib.sha256(open(f'{directory_two}/{filename}', 'rb').read()).hexdigest()
            if hash_01 == hash_02:
                print(f'The file - {filename} is identical in the directories {directory_one} and {directory_two}')
            else:
                print(f'The file - {filename} is different in the directories {directory_one} and {directory_two}')
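If the CSV files can be large, reading each file fully into memory just to hash it is wasteful. A minimal variation (the helper name is mine, not from the original answer) streams each file through the hash in fixed-size chunks:

import hashlib

def sha256_of_file(path, chunk_size=65536):
    # Stream the file through the hash in chunks so large files
    # never have to fit in memory all at once.
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()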
Method 2: using os st_size
This method uses the Python module os. In this example, I compared the sizes of the files. This method works ok, but it will misclassify as identical any file whose data changed without changing the size of the file.
import os

def compare_common_files_by_size(directory_one, directory_two):
    d1_files = set(os.listdir(directory_one))
    d2_files = set(os.listdir(directory_two))
    common_files = list(d1_files & d2_files)
    if common_files:
        for filename in common_files:
            file_01 = os.stat(f'{directory_one}/{filename}')
            file_02 = os.stat(f'{directory_two}/{filename}')
            # Compare only the file sizes reported by os.stat.
            if file_01.st_size == file_02.st_size:
                print(f'The file - {filename} is identical in the directories {directory_one} and {directory_two}')
            else:
                print(f'The file - {filename} is different in the directories {directory_one} and {directory_two}')
Method 3: using os st_size and st_mtime
This method also uses the Python module os. In this example, I compared not only the size of each file, but also its last modification time. This method works reasonably well, but it can misclassify identical files as different. In testing, I re-saved a file with no data modifications and os.st_mtime flagged the file as different even though it really wasn't.
import os

def compare_common_files_by_metadata(directory_one, directory_two):
    d1_files = set(os.listdir(directory_one))
    d2_files = set(os.listdir(directory_two))
    common_files = list(d1_files & d2_files)
    if common_files:
        for filename in common_files:
            file_01 = os.stat(f'{directory_one}/{filename}')
            file_02 = os.stat(f'{directory_two}/{filename}')
            # Compare both the size and the last-modification time.
            if file_01.st_size == file_02.st_size and file_01.st_mtime == file_02.st_mtime:
                print(f'The file - {filename} is identical in the directories {directory_one} and {directory_two}')
            else:
                print(f'The file - {filename} is different in the directories {directory_one} and {directory_two}')
Method 4: using set()
This example uses Python's set() to determine the line-by-line differences between two CSV files with the same name. This method will output the exact rows that differ between the two files.
import csv
import os

def compare_common_files_by_lines(directory_one, directory_two):
    d1_files = set(os.listdir(directory_one))
    d2_files = set(os.listdir(directory_two))
    common_files = list(d1_files & d2_files)
    if common_files:
        for filename in common_files:
            if filename.endswith('.csv'):
                file_01 = open(f'{directory_one}/{filename}', 'r', encoding='ISO-8859-1')
                file_02 = open(f'{directory_two}/{filename}', 'r', encoding='ISO-8859-1')
                # Read each file into a set of row tuples and take the
                # symmetric difference to find rows unique to either file.
                csv_file_01 = set(map(tuple, csv.reader(file_01)))
                csv_file_02 = set(map(tuple, csv.reader(file_02)))
                different = csv_file_01 ^ csv_file_02
                for row in sorted(different, reverse=True):
                    if row:
                        print(f'This row:\n {row}\nwas different between the file {filename} in the directories'
                              f' {directory_one} and {directory_two}')
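One caveat: because set() collapses duplicate rows and ignores ordering, two files that repeat a row a different number of times will look identical under this method. A sketch using collections.Counter instead (the function name is mine, not from the original answer) also catches repeated-row differences:

import csv
from collections import Counter

def compare_csv_rows_with_counts(path_one, path_two):
    # Count how many times each row appears, so duplicated rows that
    # occur a different number of times are not missed.
    with open(path_one, 'r', encoding='ISO-8859-1') as f:
        counts_01 = Counter(map(tuple, csv.reader(f)))
    with open(path_two, 'r', encoding='ISO-8859-1') as f:
        counts_02 = Counter(map(tuple, csv.reader(f)))
    # Report rows whose occurrence counts differ between the two files.
    for row in (counts_01.keys() | counts_02.keys()):
        if counts_01[row] != counts_02[row]:
            print(f'Row {row}: {counts_01[row]} occurrence(s) in {path_one}, '
                  f'{counts_02[row]} in {path_two}')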
Method 5: using filecmp.cmp
This method uses the Python module filecmp. In this example I used filecmp.cmp with shallow set to False. Setting this parameter to False instructs filecmp to compare the contents of the files rather than the os.stat() metadata (file type, size, and modification time) that the default shallow comparison uses. This method works as well as Method 1, which used hashlib.
import filecmp
import os

def compare_common_files(directory_one, directory_two):
    d1_files = set(os.listdir(directory_one))
    d2_files = set(os.listdir(directory_two))
    common_files = list(d1_files & d2_files)
    if common_files:
        for filename in common_files:
            file_01 = f'{directory_one}/{filename}'
            file_02 = f'{directory_two}/{filename}'
            # shallow=False forces a content comparison instead of os.stat.
            comparison = filecmp.cmp(file_01, file_02, shallow=False)
            if comparison:
                print(f'The file - {filename} is identical in the directories - {directory_one} and {directory_two}')
            else:
                print(f'The file - {filename} is different in the directories - {directory_one} and {directory_two}')
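As a quick usage sketch (the directory paths here are placeholders, not from the original question):

# Hypothetical paths; substitute your own local and server directories.
dir_local = '/home/user/csv_data'
dir_server = '/mnt/server/csv_data'

compare_common_files(dir_local, dir_server)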
Method 6: using filecmp.dircmp
This method also uses the Python module filecmp. In this example I used filecmp.dircmp, which lets me identify not only the files that are non-common between the two directories, but also the files that have the same name but different content. Note that dircmp classifies diff_files using shallow comparisons by default, so a pure content check still needs shallow=False (see the sketch after the code).
import filecmp

def directory_recursive(directory_one, directory_two):
    files = filecmp.dircmp(directory_one, directory_two)
    for filename in files.diff_files:
        print(f'The file - {filename} is different in the directories - {files.left} and {files.right}')
    for filename in files.left_only:
        print(f'The file - {filename} - was only found in the directory {files.left}')
    for filename in files.right_only:
        print(f'The file - {filename} - was only found in the directory {files.right}')
    # Recurse into the subdirectories common to both trees, so the
    # whole folder structure is covered, as the function name implies.
    for sub_comparison in files.subdirs.values():
        directory_recursive(sub_comparison.left, sub_comparison.right)
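Since the question asks specifically for a data-only comparison, here is a sketch (the function name is mine) that walks the tree with dircmp but re-checks the common files with filecmp.cmpfiles and shallow=False, which compares contents rather than stat metadata:

import filecmp

def deep_directory_compare(directory_one, directory_two):
    comparison = filecmp.dircmp(directory_one, directory_two)
    # cmpfiles with shallow=False compares file contents, not metadata.
    match, mismatch, errors = filecmp.cmpfiles(
        directory_one, directory_two, comparison.common_files, shallow=False)
    for filename in mismatch:
        print(f'The file - {filename} has different contents in {directory_one} and {directory_two}')
    for filename in errors:
        print(f'The file - {filename} could not be compared')
    for sub in comparison.subdirs.values():
        deep_directory_compare(sub.left, sub.right)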
Method 7: line-by-line comparison
This example does a line-by-line comparison of two CSV files and outputs the lines that differ. The output can be added to a Python dictionary or to a JSON file for secondary processing. Note that this version assumes the two files have the same number of lines; a variation that handles different lengths follows the code.
import csv

def get_csv_file_lines(file):
    with open(file, 'r', encoding='utf-8') as csv_file:
        rows = csv.reader(csv_file)
        for row in rows:
            yield row

def compare_csv_files_line_by_line(csv_file_one, csv_file_two):
    csvfile_02 = get_csv_file_lines(csv_file_two)
    for line_one in get_csv_file_lines(csv_file_one):
        line_two = next(csvfile_02)
        if line_two != line_one:
            print('File names being compared:')
            print(f'csv_file_one: {csv_file_one}')
            print(f'csv_file_two: {csv_file_two}')
            print('The following rows have a difference in the files being compared.')
            print('csv_file_one:', line_one)
            print('csv_file_two:', line_two)
            print('\n')
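If the two files can have a different number of lines, next() will raise StopIteration once the second file runs out. A variation using itertools.zip_longest (the function name is mine) pads the shorter file with None instead:

import csv
from itertools import zip_longest

def compare_csv_files_any_length(csv_file_one, csv_file_two):
    with open(csv_file_one, 'r', encoding='utf-8') as f1, \
         open(csv_file_two, 'r', encoding='utf-8') as f2:
        # zip_longest pads the shorter file with None so trailing
        # extra rows in either file are still reported.
        for line_one, line_two in zip_longest(csv.reader(f1), csv.reader(f2)):
            if line_one != line_two:
                print('csv_file_one:', line_one)
                print('csv_file_two:', line_two)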
Local file system to S3 bucket using hashlib
The example below is a real world use case for comparing files between a local file system and a remote S3 bucket. I originally was going to use the object.e_tag that AWS S3 creates, but that tag is not always an MD5 of the contents (for example, for multipart uploads), so it shouldn't be relied on in a hashing comparison operation. I decided to query S3 and load each file into an in-memory file system that could be queried and emptied during each comparison operation. This method worked very well and had no adverse impact on my system performance.
import fs
import os
import boto3
import hashlib

def create_temp_memory_filesystem():
    # Create an in-memory file system with a working directory inside it.
    mem_fs = fs.open_fs('mem://')
    mem_fs.makedir('hidden_dir')
    return mem_fs, 'hidden_dir'

def query_s3_file_by_name(filename, memory_filesystem, temp_directory):
    s3 = boto3.resource('s3', aws_access_key_id='your_access_key_id',
                        aws_secret_access_key='your_secret_access_key')
    bucket = s3.Bucket('your_bucket_name')
    for obj in bucket.objects.all():
        if obj.key == filename:
            body = obj.get()['Body'].read()
            # Write the raw bytes into the in-memory file system.
            with memory_filesystem.open(f'{temp_directory}/s3_{filename}', 'wb') as f:
                f.write(body)

def compare_local_files_to_s3_files(local_csv_files):
    mem_fs, directory_name = create_temp_memory_filesystem()
    files = set(os.listdir(local_csv_files))
    for filename in files:
        if filename.endswith('.csv'):
            local_file_hash = hashlib.sha256(
                open(f'{local_csv_files}/{filename}', 'rb').read()).hexdigest()
            query_s3_file_by_name(filename, mem_fs, directory_name)
            virtual_files = mem_fs.opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                # Open the staged copy through the in-memory file system,
                # not the OS, since it has no real path on disk.
                with virtual_files.open(file_name, 'rb') as vf:
                    s3_file_hash = hashlib.sha256(vf.read()).hexdigest()
                if local_file_hash == s3_file_hash:
                    print(f'The file - {filename} is identical in both the local file system and the S3 bucket.')
                else:
                    print(f'The file - {filename} is different between the local file system and the S3 bucket.')
                virtual_files.remove(file_name)
    mem_fs.close()
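If you don't need the staged copy afterwards, a simpler sketch hashes the S3 object's bytes directly and skips the in-memory file system entirely (the bucket name and key are placeholders, and credentials are assumed to come from the environment):

import boto3
import hashlib

def s3_object_sha256(bucket_name, key):
    # Read the object body once and hash it in memory.
    s3 = boto3.resource('s3')  # credentials resolved from the environment
    body = s3.Bucket(bucket_name).Object(key).get()['Body'].read()
    return hashlib.sha256(body).hexdigest()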
Local file system to S3 bucket using filecmp
This example is the same as the one above, except I use filecmp.cmp instead of hashlib for the comparison operation. Because filecmp.cmp needs real paths on disk (it uses os.stat and the built-in open), the remote file is staged in a temporary directory rather than in the in-memory file system.
import filecmp
import os
import tempfile
import boto3

def query_s3_file_by_name(filename, temp_directory):
    s3 = boto3.resource('s3', aws_access_key_id='your_access_key_id',
                        aws_secret_access_key='your_secret_access_key')
    bucket = s3.Bucket('your_bucket_name')
    for obj in bucket.objects.all():
        if obj.key == filename:
            # Stage the S3 object as a real file so filecmp can stat it.
            staged_path = os.path.join(temp_directory, f's3_{filename}')
            bucket.download_file(obj.key, staged_path)
            return staged_path
    return None

def compare_local_files_to_s3_files(local_csv_files):
    with tempfile.TemporaryDirectory() as temp_directory:
        files = set(os.listdir(local_csv_files))
        for filename in files:
            if filename.endswith('.csv'):
                local_file = f'{local_csv_files}/{filename}'
                staged_file = query_s3_file_by_name(filename, temp_directory)
                if staged_file is None:
                    continue
                comparison = filecmp.cmp(local_file, staged_file, shallow=False)
                if comparison:
                    print(f'The file - {filename} is identical in both the local file system and the S3 bucket.')
                else:
                    print(f'The file - {filename} is different between the local file system and the S3 bucket.')
                os.remove(staged_file)
Local file system to Google Cloud storage bucket using hashlib
This example is similar to the S3 hashlib code example above, but it uses a Google Cloud storage bucket.
import fs
import os
import hashlib
from google.cloud import storage

def create_temp_memory_filesystem():
    # Create an in-memory file system with a working directory inside it.
    mem_fs = fs.open_fs('mem://')
    mem_fs.makedir('hidden_dir')
    return mem_fs, 'hidden_dir'

def query_google_cloud_storage_file_by_name(filename, memory_filesystem, temp_directory):
    client = storage.Client.from_service_account_json('path_to_your_credentials.json')
    bucket = client.get_bucket('your_bucket_name')
    for blob in bucket.list_blobs():
        if blob.name == filename:
            # Download the blob's raw bytes into the in-memory file system.
            with memory_filesystem.open(f'{temp_directory}/{filename}', 'wb') as f:
                f.write(blob.download_as_bytes())

def compare_local_files_to_google_storage_files(local_csv_files):
    mem_fs, directory_name = create_temp_memory_filesystem()
    files = set(os.listdir(local_csv_files))
    for filename in files:
        if filename.endswith('.csv'):
            local_file_hash = hashlib.sha256(
                open(f'{local_csv_files}/{filename}', 'rb').read()).hexdigest()
            query_google_cloud_storage_file_by_name(filename, mem_fs, directory_name)
            virtual_files = mem_fs.opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                # Open the staged copy through the in-memory file system.
                with virtual_files.open(file_name, 'rb') as vf:
                    gs_file_hash = hashlib.sha256(vf.read()).hexdigest()
                if local_file_hash == gs_file_hash:
                    print(f'The file - {filename} is identical in both the local file system and the Google Cloud bucket.')
                else:
                    print(f'The file - {filename} is different between the local file system and the Google Cloud bucket.')
                virtual_files.remove(file_name)
    mem_fs.close()
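As with S3, if the staged copy isn't needed, a shorter sketch hashes the blob's bytes directly; the credentials path and bucket name are placeholders, and download_as_bytes assumes a recent version of google-cloud-storage:

import hashlib
from google.cloud import storage

def gcs_blob_sha256(bucket_name, blob_name):
    client = storage.Client.from_service_account_json('path_to_your_credentials.json')
    # download_as_bytes returns the blob contents as a bytes object.
    data = client.get_bucket(bucket_name).blob(blob_name).download_as_bytes()
    return hashlib.sha256(data).hexdigest()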
Local file system to Google Cloud storage bucket using filecmp
This example is similar to the S3 filecmp code example above, but it uses a Google Cloud storage bucket. As before, the remote file is staged in a temporary directory on disk because filecmp.cmp requires real paths.
import filecmp
import os
import tempfile
from google.cloud import storage

def query_google_cloud_storage_file_by_name(filename, temp_directory):
    client = storage.Client.from_service_account_json('path_to_your_credentials.json')
    bucket = client.get_bucket('your_bucket_name')
    for blob in bucket.list_blobs():
        if blob.name == filename:
            # Stage the blob as a real file so filecmp can stat it.
            staged_path = os.path.join(temp_directory, filename)
            blob.download_to_filename(staged_path)
            return staged_path
    return None

def compare_local_files_to_google_storage_files(local_csv_files):
    with tempfile.TemporaryDirectory() as temp_directory:
        files = set(os.listdir(local_csv_files))
        for filename in files:
            if filename.endswith('.csv'):
                local_file = f'{local_csv_files}/{filename}'
                staged_file = query_google_cloud_storage_file_by_name(filename, temp_directory)
                if staged_file is None:
                    continue
                comparison = filecmp.cmp(local_file, staged_file, shallow=False)
                if comparison:
                    print(f'The file - {filename} is identical in both the local file system and the Google Cloud bucket.')
                else:
                    print(f'The file - {filename} is different between the local file system and the Google Cloud bucket.')
                os.remove(staged_file)