I have two distinct programs that processed the same input files. During the processing, both programs generated files with names based on their original counterpart. The possible changes, without considering the modified paths and file extensions, would be, in regex terms, as follow:
- Program1 can append
(_[[:alnum:]]+)*
to the original filenames. - Program2 can append
(_[[:alnum:]]+)*
and prepend([[:alnum:]]+_)*
to the original filenames.
Now, given a list of several millions of file paths from Program2, I need to match each one of them to an unique file path from Program1.
What would be a sensible way to do it?
Here’s where I’m stuck:
JavaScript
x
26
26
1
#!/usr/bin/env python3
2
import os
3
4
##### DATA #####
5
6
program1_files = [
7
# Around 10 thousand of unique items (parsed from a file).
8
'path1/jobxxx/n1_file_001_xxx.tiff',
9
'path1/jobxxx/n1_file_002_yyy_xxx.tiff',
10
'path1/jobxxx/n2_file_001_yyy.tiff'
11
]
12
13
program2_files = [
14
# Around 10 millions of items (parsed from a file),
15
# with only around 10 thousand of unique items
16
# IMO the items should be directly processed/matched while parsing the file
17
'path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc',
18
'path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc',
19
'path2/JXX/00002_n1_file_002_XXX_yyy_zz.mrc',
20
'path2/jXX/00003_n2_file_001_XXX_yyy_zz.mrc',
21
'path2/JZZ/00101_YYY_n1_file_001_xx_zzz.mrc',
22
'path2/JZZ/00102_YYY_n2_file_001_xx_zzz.mrc'
23
]
24
25
################
26
JavaScript
1
19
19
1
def get_parts(str):
2
return os.path.splitext(os.path.basename(str))[0].split('_')
3
4
program1_fileparts = dict(zip(program1_files, map(get_parts,program1_files)))
5
6
file2_to_file1 = {}
7
8
for file2 in program2_files:
9
10
if file2 in file2_to_file1: continue # skip already processed file2
11
12
file2_parts = get_parts(file2)
13
14
for file1 in program1_files:
15
16
file1_parts = program1_fileparts[file1]
17
18
# here I'm a little lost on what to do
19
Advertisement
Answer
Here’s a solution that uses a prefix tree to quickly match against the program 1 output files:
JavaScript
1
81
81
1
#!/usr/bin/env python3
2
import os
3
from collections import defaultdict
4
from typing import Any, Optional
5
6
##### DATA #####
7
8
program1_files = [
9
"path1/jobxxx/n1_file_001_xxx.tiff",
10
"path1/jobxxx/n1_file_002_yyy_xxx.tiff",
11
"path1/jobxxx/n2_file_001_yyy.tiff",
12
]
13
14
program2_files = [
15
"path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc",
16
"path2/JXX/00002_n1_file_002_XXX_yyy_zz.mrc",
17
"path2/jXX/00003_n2_file_001_XXX_yyy_zz.mrc",
18
"path2/JZZ/00101_YYY_n1_file_001_xx_zzz.mrc",
19
"path2/JZZ/00102_YYY_n2_file_001_xx_zzz.mrc",
20
]
21
22
################
23
24
def get_parts(s: str) -> list[str]:
25
return os.path.splitext(os.path.basename(s))[0].split("_")
26
27
def build_prefix_tree(files: list[str]) -> dict[str, Any]:
28
"""Build a prefix tree of file path parts from program 1."""
29
# Node = dict[str, Node | tuple[list[str], str]]
30
31
def insert(node: dict[str, Any], parts: list[str], path: str) -> None:
32
if parts[0] not in node:
33
# create leaf entry
34
node[parts[0]] = (parts[1:], path)
35
return
36
child = node[parts[0]]
37
if isinstance(child, tuple):
38
# found a collision with an existing leaf: expand it to a full node
39
child_parts, child_path = child
40
child = {child_parts[0]: (child_parts[1:], child_path)}
41
node[parts[0]] = child
42
# recurse into next entry
43
insert(child, parts[1:], path)
44
45
prefix_tree: dict[str, Any] = {}
46
for file in files:
47
insert(prefix_tree, get_parts(file), file)
48
return prefix_tree
49
50
def recursive_match(node: dict[str, Any], parts: list[str]) -> Optional[str]:
51
"""Lookup a prefix match in `node`."""
52
if parts[0] not in node:
53
# couldn't match parts
54
return None
55
child = node[parts[0]]
56
if isinstance(child, tuple):
57
# found matching file
58
return child[1]
59
return recursive_match(child, parts[1:])
60
61
62
tree = build_prefix_tree(program1_files)
63
64
file1_to_file2: dict[str, list[str]] = defaultdict(list)
65
file2_to_file1: dict[str, str] = {}
66
for file2 in program2_files:
67
if file2 in file2_to_file1:
68
continue # skip already processed file2
69
70
file2_parts = get_parts(file2)
71
for skip_parts in range(len(file2_parts)):
72
# first try skipping no parts, then skip 1 part, then 2, and so on
73
parts = file2_parts[skip_parts:]
74
if (file1 := recursive_match(tree, parts)) is not None:
75
file1_to_file2[file1].append(file2)
76
file2_to_file1[file2] = file1
77
break
78
else:
79
# reached the end of the loop without breaking
80
print(f"Warning: couldn't find match for {file2}")
81