Skip to content
Advertisement

Matching modified filenames

I have two distinct programs that processed the same input files. During the processing, both programs generated files with names based on their original counterpart. The possible changes, without considering the modified paths and file extensions, would be, in regex terms, as follow:

  • Program1 can append (_[[:alnum:]]+)* to the original filenames.
  • Program2 can append (_[[:alnum:]]+)* and prepend ([[:alnum:]]+_)* to the original filenames.

Now, given a list of several millions of file paths from Program2, I need to match each one of them to an unique file path from Program1.

What would be a sensible way to do it?

Here’s where I’m stuck:

#!/usr/bin/env python3
import os

##### DATA #####

program1_files = [
    # Around 10 thousand of unique items (parsed from a file).
    'path1/jobxxx/n1_file_001_xxx.tiff',
    'path1/jobxxx/n1_file_002_yyy_xxx.tiff',
    'path1/jobxxx/n2_file_001_yyy.tiff'
]

program2_files = [
    # Around 10 millions of items (parsed from a file),
    # with only around 10 thousand of unique items
    # IMO the items should be directly processed/matched while parsing the file
    'path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc',
    'path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc',
    'path2/JXX/00002_n1_file_002_XXX_yyy_zz.mrc',
    'path2/jXX/00003_n2_file_001_XXX_yyy_zz.mrc',
    'path2/JZZ/00101_YYY_n1_file_001_xx_zzz.mrc',
    'path2/JZZ/00102_YYY_n2_file_001_xx_zzz.mrc'
]

################
def get_parts(str):
    return os.path.splitext(os.path.basename(str))[0].split('_')

program1_fileparts = dict(zip(program1_files, map(get_parts,program1_files)))

file2_to_file1 = {}

for file2 in program2_files:

    if file2 in file2_to_file1: continue   # skip already processed file2

    file2_parts = get_parts(file2)

    for file1 in program1_files:

        file1_parts = program1_fileparts[file1]

        # here I'm a little lost on what to do

Advertisement

Answer

Here’s a solution that uses a prefix tree to quickly match against the program 1 output files:

#!/usr/bin/env python3
import os
from collections import defaultdict
from typing import Any, Optional

##### DATA #####

program1_files = [
    "path1/jobxxx/n1_file_001_xxx.tiff",
    "path1/jobxxx/n1_file_002_yyy_xxx.tiff",
    "path1/jobxxx/n2_file_001_yyy.tiff",
]

program2_files = [
    "path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc",
    "path2/JXX/00002_n1_file_002_XXX_yyy_zz.mrc",
    "path2/jXX/00003_n2_file_001_XXX_yyy_zz.mrc",
    "path2/JZZ/00101_YYY_n1_file_001_xx_zzz.mrc",
    "path2/JZZ/00102_YYY_n2_file_001_xx_zzz.mrc",
]

################

def get_parts(s: str) -> list[str]:
    return os.path.splitext(os.path.basename(s))[0].split("_")

def build_prefix_tree(files: list[str]) -> dict[str, Any]:
    """Build a prefix tree of file path parts from program 1."""
    # Node = dict[str, Node | tuple[list[str], str]]

    def insert(node: dict[str, Any], parts: list[str], path: str) -> None:
        if parts[0] not in node:
            # create leaf entry
            node[parts[0]] = (parts[1:], path)
            return
        child = node[parts[0]]
        if isinstance(child, tuple):
            # found a collision with an existing leaf: expand it to a full node
            child_parts, child_path = child
            child = {child_parts[0]: (child_parts[1:], child_path)}
            node[parts[0]] = child
        # recurse into next entry
        insert(child, parts[1:], path)

    prefix_tree: dict[str, Any] = {}
    for file in files:
        insert(prefix_tree, get_parts(file), file)
    return prefix_tree

def recursive_match(node: dict[str, Any], parts: list[str]) -> Optional[str]:
    """Lookup a prefix match in `node`."""
    if parts[0] not in node:
        # couldn't match parts
        return None
    child = node[parts[0]]
    if isinstance(child, tuple):
        # found matching file
        return child[1]
    return recursive_match(child, parts[1:])


tree = build_prefix_tree(program1_files)

file1_to_file2: dict[str, list[str]] = defaultdict(list)
file2_to_file1: dict[str, str] = {}
for file2 in program2_files:
    if file2 in file2_to_file1:
        continue  # skip already processed file2

    file2_parts = get_parts(file2)
    for skip_parts in range(len(file2_parts)):
        # first try skipping no parts, then skip 1 part, then 2, and so on
        parts = file2_parts[skip_parts:]
        if (file1 := recursive_match(tree, parts)) is not None:
            file1_to_file2[file1].append(file2)
            file2_to_file1[file2] = file1
            break
    else:
        # reached the end of the loop without breaking
        print(f"Warning: couldn't find match for {file2}")
User contributions licensed under: CC BY-SA
6 People found this is helpful
Advertisement