How to process files based on their date in Python?

I have two sorts of files: xml files and txt files. The files have a date in their name. If the date of an xml file matches the date of a txt file, I want to open the txt file, do some processing and write the output to a list. After that I want to change the xml file. Multiple xml files can have the same date, but the txt file for a date is unique, so more than one xml file can be linked to the same txt file.

Right now I have a problem: my to_csv list contains data from both 20200907 and 20201025. I don’t want it to work like that. I want my to_csv list to handle just one file (and thus one date) at a time.

# Folders that hold the generated XML and TXT files.
# (Backslashes restored: the page extraction had stripped them.)
output_xml = r"c:\desktop\energy\XML_Output"
output_txt = r"c:\desktop\energy\TXT_Output"

xml_name = os.listdir(output_xml)
txt_name = os.listdir(output_txt)
txt_name = [x.replace('-', '') for x in txt_name]  # remove the - in the filenames

# Extract the date from the xml and txt file names.
xml_dates = []
for file in xml_name:
    # Raw string so that \d reaches the regex engine unchanged.
    find = re.search(r"_(.\d+)-", file).group(1)
    xml_dates.append(find)

txt_dates = []
for file in txt_name:
    find = re.search("MM(.+?)AB", file).group(1)
    txt_dates.append(find)

# This is some reproducible output from what is produced by the snippet above.
xml_dates = ['20200907', '20200908', '20201025', '20201025', '20201025', '20201025']
txt_dates = ['20200907', '20201025']

# Flat list of every selected row, for all dates together.
to_csv = []

for date_xml in xml_dates:
    for date_txt in txt_dates:
        if date_xml != date_txt:
            continue

        match_txt = [s for s in txt_name if date_txt in s]  # matching txt file
        match_xml = [s for s in xml_name if date_xml in s]  # matching xml files

        # Put back the dashes that were stripped from txt_name so the name
        # matches the file on disk again.
        match_txt_temp = match_txt[0]
        match_txt_score = (match_txt_temp[:6] + '-' + match_txt_temp[6:8] + '-'
                           + match_txt_temp[8:10] + '-' + match_txt_temp[10:12]
                           + match_txt_temp[12:])

        # newline="" is what the csv module expects for files handed to csv.reader.
        with open(output_txt + "/" + match_txt_score, "r", newline="") as outer:
            reader = csv.reader(outer, delimiter="\t")

            # The first row is skipped, as the original nested loop did
            # implicitly (presumably a header row -- confirm).
            next(reader, None)

            for row in reader:
                if not row:  # ignore empty lines
                    continue

                # csv yields strings; convert before the numeric comparison
                # (str > int raises TypeError on Python 3).
                energy_level = float(row[20])

                if energy_level > 250:
                    to_csv.append(row)

print(to_csv)

JavaScript
​x
 
# Folders that hold the generated XML and TXT files.
# (Backslashes restored: the page extraction had stripped them.)
output_xml = r"c:\desktop\energy\XML_Output"
output_txt = r"c:\desktop\energy\TXT_Output"

xml_name = os.listdir(output_xml)
txt_name = os.listdir(output_txt)
txt_name = [x.replace('-', '') for x in txt_name]  # remove the - in the filenames

# Extract the date from the xml and txt file names.
xml_dates = []
for file in xml_name:
    # Raw string so that \d reaches the regex engine unchanged.
    find = re.search(r"_(.\d+)-", file).group(1)
    xml_dates.append(find)

txt_dates = []
for file in txt_name:
    find = re.search("MM(.+?)AB", file).group(1)
    txt_dates.append(find)

# This is some reproducible output from what is produced by the snippet above.
xml_dates = ['20200907', '20200908', '20201025', '20201025', '20201025', '20201025']
txt_dates = ['20200907', '20201025']

# Flat list of every selected row, for all dates together.
to_csv = []

for date_xml in xml_dates:
    for date_txt in txt_dates:
        if date_xml != date_txt:
            continue

        match_txt = [s for s in txt_name if date_txt in s]  # matching txt file
        match_xml = [s for s in xml_name if date_xml in s]  # matching xml files

        # Put back the dashes that were stripped from txt_name so the name
        # matches the file on disk again.
        match_txt_temp = match_txt[0]
        match_txt_score = (match_txt_temp[:6] + '-' + match_txt_temp[6:8] + '-'
                           + match_txt_temp[8:10] + '-' + match_txt_temp[10:12]
                           + match_txt_temp[12:])

        # newline="" is what the csv module expects for files handed to csv.reader.
        with open(output_txt + "/" + match_txt_score, "r", newline="") as outer:
            reader = csv.reader(outer, delimiter="\t")

            # The first row is skipped, as the original nested loop did
            # implicitly (presumably a header row -- confirm).
            next(reader, None)

            for row in reader:
                if not row:  # ignore empty lines
                    continue

                # csv yields strings; convert before the numeric comparison
                # (str > int raises TypeError on Python 3).
                energy_level = float(row[20])

                if energy_level > 250:
                    to_csv.append(row)

print(to_csv)
​

Current output:

[['1', '2', '3', '20200907', '4', '5'], 
['1', '2', '3', '20200907', '4', '5'], 
['1', '2', '3', '20200907', '4', '5'], 
['1', '2', '3', '20201025', '4', '5'], 
['1', '2', '3', '20201025', '4', '5']]

JavaScript
 
[['1', '2', '3', '20200907', '4', '5'], 
['1', '2', '3', '20200907', '4', '5'], 
['1', '2', '3', '20200907', '4', '5'], 
['1', '2', '3', '20201025', '4', '5'], 
['1', '2', '3', '20201025', '4', '5']]
​

Desired output:

[[['1', '2', '3', '20200907', '4', '5'], 
['1', '2', '3', '20200907', '4', '5'], 
['1', '2', '3', '20200907', '4', '5']], 
[['1', '2', '3', '20201025', '4', '5'], 
['1', '2', '3', '20201025', '4', '5']]]

JavaScript
 
[[['1', '2', '3', '20200907', '4', '5'], 
['1', '2', '3', '20200907', '4', '5'], 
['1', '2', '3', '20200907', '4', '5']], 
[['1', '2', '3', '20201025', '4', '5'], 
['1', '2', '3', '20201025', '4', '5']]]
​

Answer

You said that you have only one txt file per date and only want to process xml files if they are linked to a txt file. That means that a single loop over txt_dates is enough:

...
# One pass over the txt dates is enough: each date has exactly one txt file,
# and xml files only matter when a txt file exists for their date.
for date_txt in txt_dates:
    date_xml = date_txt

    match_txt = [s for s in txt_name if date_txt in s]  # the matching txt file
    match_xml = [s for s in xml_name if date_xml in s]  # possible matching xml files
    if not match_xml:  # no matching xml files
        continue

    # Put back the dashes that were stripped from txt_name so the name
    # matches the file on disk again.
    match_txt_temp = match_txt[0]
    match_txt_score = (match_txt_temp[:6] + '-' + match_txt_temp[6:8] + '-'
                       + match_txt_temp[8:10] + '-' + match_txt_temp[10:12]
                       + match_txt_temp[12:])

    # Prepare a new list for that date.
    curr = []

    # newline="" is what the csv module expects for files handed to csv.reader.
    with open(output_txt + "/" + match_txt_score, "r", newline="") as outer:
        # Tab-separated input (the "\t" had lost its backslash in extraction).
        reader = csv.reader(outer, delimiter="\t")

        # The first row is skipped, as the original nested loop did
        # implicitly (presumably a header row -- confirm).
        next(reader, None)

        for row in reader:
            if not row:  # ignore empty lines
                continue

            # csv yields strings; convert before the numeric comparison
            # (str > int raises TypeError on Python 3).
            energy_level = float(row[20])
            if energy_level > 250:
                curr.append(row)

    if curr:  # only keep dates that produced at least one row
        to_csv.append(curr)

print(to_csv)

JavaScript
 
...
# One pass over the txt dates is enough: each date has exactly one txt file,
# and xml files only matter when a txt file exists for their date.
for date_txt in txt_dates:
    date_xml = date_txt

    match_txt = [s for s in txt_name if date_txt in s]  # the matching txt file
    match_xml = [s for s in xml_name if date_xml in s]  # possible matching xml files
    if not match_xml:  # no matching xml files
        continue

    # Put back the dashes that were stripped from txt_name so the name
    # matches the file on disk again.
    match_txt_temp = match_txt[0]
    match_txt_score = (match_txt_temp[:6] + '-' + match_txt_temp[6:8] + '-'
                       + match_txt_temp[8:10] + '-' + match_txt_temp[10:12]
                       + match_txt_temp[12:])

    # Prepare a new list for that date.
    curr = []

    # newline="" is what the csv module expects for files handed to csv.reader.
    with open(output_txt + "/" + match_txt_score, "r", newline="") as outer:
        # Tab-separated input (the "\t" had lost its backslash in extraction).
        reader = csv.reader(outer, delimiter="\t")

        # The first row is skipped, as the original nested loop did
        # implicitly (presumably a header row -- confirm).
        next(reader, None)

        for row in reader:
            if not row:  # ignore empty lines
                continue

            # csv yields strings; convert before the numeric comparison
            # (str > int raises TypeError on Python 3).
            energy_level = float(row[20])
            if energy_level > 250:
                curr.append(row)

    if curr:  # only keep dates that produced at least one row
        to_csv.append(curr)

print(to_csv)
​

BEWARE: since what you provided is not a reproducible example, I could not test the above code, so typos are possible…

Advertisement

Answer