I have two distinct programs that processed the same input files. During the processing, both programs generated files with names based on their original counterpart. The possible changes, without considering the modified paths and file extensions, would be, in regex terms, as follow:
- Program1 can append
(_[[:alnum:]]+)*
to the original filenames. - Program2 can append
(_[[:alnum:]]+)*
and prepend([[:alnum:]]+_)*
to the original filenames.
Now, given a list of several millions of file paths from Program2, I need to match each one of them to an unique file path from Program1.
What would be a sensible way to do it?
Here’s where I’m stuck:
#!/usr/bin/env python3 import os ##### DATA ##### program1_files = [ # Around 10 thousand of unique items (parsed from a file). 'path1/jobxxx/n1_file_001_xxx.tiff', 'path1/jobxxx/n1_file_002_yyy_xxx.tiff', 'path1/jobxxx/n2_file_001_yyy.tiff' ] program2_files = [ # Around 10 millions of items (parsed from a file), # with only around 10 thousand of unique items # IMO the items should be directly processed/matched while parsing the file 'path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc', 'path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc', 'path2/JXX/00002_n1_file_002_XXX_yyy_zz.mrc', 'path2/jXX/00003_n2_file_001_XXX_yyy_zz.mrc', 'path2/JZZ/00101_YYY_n1_file_001_xx_zzz.mrc', 'path2/JZZ/00102_YYY_n2_file_001_xx_zzz.mrc' ] ################
def get_parts(str): return os.path.splitext(os.path.basename(str))[0].split('_') program1_fileparts = dict(zip(program1_files, map(get_parts,program1_files))) file2_to_file1 = {} for file2 in program2_files: if file2 in file2_to_file1: continue # skip already processed file2 file2_parts = get_parts(file2) for file1 in program1_files: file1_parts = program1_fileparts[file1] # here I'm a little lost on what to do
Advertisement
Answer
Here’s a solution that uses a prefix tree to quickly match against the program 1 output files:
#!/usr/bin/env python3 import os from collections import defaultdict from typing import Any, Optional ##### DATA ##### program1_files = [ "path1/jobxxx/n1_file_001_xxx.tiff", "path1/jobxxx/n1_file_002_yyy_xxx.tiff", "path1/jobxxx/n2_file_001_yyy.tiff", ] program2_files = [ "path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc", "path2/JXX/00002_n1_file_002_XXX_yyy_zz.mrc", "path2/jXX/00003_n2_file_001_XXX_yyy_zz.mrc", "path2/JZZ/00101_YYY_n1_file_001_xx_zzz.mrc", "path2/JZZ/00102_YYY_n2_file_001_xx_zzz.mrc", ] ################ def get_parts(s: str) -> list[str]: return os.path.splitext(os.path.basename(s))[0].split("_") def build_prefix_tree(files: list[str]) -> dict[str, Any]: """Build a prefix tree of file path parts from program 1.""" # Node = dict[str, Node | tuple[list[str], str]] def insert(node: dict[str, Any], parts: list[str], path: str) -> None: if parts[0] not in node: # create leaf entry node[parts[0]] = (parts[1:], path) return child = node[parts[0]] if isinstance(child, tuple): # found a collision with an existing leaf: expand it to a full node child_parts, child_path = child child = {child_parts[0]: (child_parts[1:], child_path)} node[parts[0]] = child # recurse into next entry insert(child, parts[1:], path) prefix_tree: dict[str, Any] = {} for file in files: insert(prefix_tree, get_parts(file), file) return prefix_tree def recursive_match(node: dict[str, Any], parts: list[str]) -> Optional[str]: """Lookup a prefix match in `node`.""" if parts[0] not in node: # couldn't match parts return None child = node[parts[0]] if isinstance(child, tuple): # found matching file return child[1] return recursive_match(child, parts[1:]) tree = build_prefix_tree(program1_files) file1_to_file2: dict[str, list[str]] = defaultdict(list) file2_to_file1: dict[str, str] = {} for file2 in program2_files: if file2 in file2_to_file1: continue # skip already processed file2 file2_parts = get_parts(file2) for skip_parts in range(len(file2_parts)): # first try skipping no parts, then skip 1 part, then 2, and so on parts = file2_parts[skip_parts:] if (file1 := recursive_match(tree, parts)) is not None: file1_to_file2[file1].append(file2) file2_to_file1[file2] = file1 break else: # reached the end of the loop without breaking print(f"Warning: couldn't find match for {file2}")