Output is an empty file

My code does not throw an error, it simply creates the files, but of which are empty. I tried it from the command line, and it works using the wildcard training_set_pssm/*.pssm path, but I must do it from the IDE because it is not printing the correct output anyway.
The input file is a set of checkpoint files that look like this:

From this file, which is a text file, saved as .pssm, essentially, I am extracting only the PROFILE side, which is on the right and NORMALIZING it at the same time… my code does not seem to do it correctly, and from the IDE it does not do it at all, so I am not sure what I need to modify in the script to do so at this point.

Here is the code:

#!/usr/bin/env python3
import sys
import os.path
from pathlib import Path


def pssm_list(infile):  # call list of file names and for dsspfile
    ''' Reads relevant lines from a pssm file and saves them to a list.
    Returns values of the 2 matrices (no header).'''
    with open(infile) as ofile:
        flist = ofile.readlines()[3:-6]  # list of each line of the file excluding first 3 & last 6 lines
        return flist

def lines_to_list(infile1):
    ''' Reads all lines from a file and saves them to a list containing the 'n' char. '''
    all_lines_list = []
    with open(infile1, 'r') as rfile:
        all_lines_list = rfile.readlines()
    return all_lines_list  # need to rstrip in a loop for using filenames.

def relevant_lines(infile2):
    '''Takes list (extracted from a .pssm file) and extracts the Sequence Profile Portion only.
    Returns a list of list where each element is one line of the sequence profile matrix. '''
    pssm_profile_list = pssm_list(infile2)  # contains all lines from the pssm file.
    profile_final_list = []  # for holding relevant fields of the lines
    for line in pssm_profile_list:
        #print(line)
        pssm_profile_list = line.split()[22:42]  # profile ranges from pos 22-42
        profile_final_list.append(pssm_profile_list)  # appending to final list of lists
    return profile_final_list  # list of lists


# # divide all values by 100
def write_normalized_profile(profile_final_list, ofile):
    '''Takes profile list of lists and outfile name as input. Writes each number that is in
    one of the sublists and devides it by 100. The number is converted to a string and added
    a tab and written to a file. After each sublist a newline character is written to the file.'''
    with open(ofile, "a") as wfile:
        for sublist in profile_final_list:
            #             print(sublist)
            for el in sublist:
                num = int(el) / 100
                numstring = str(num)
                wfile.write(numstring + 't')  # adding tab after each number
            wfile.write("n")  # adding newline at the end of each sublist.
            #print(sublist)
            #print(numstring)





if __name__ == '__main__':
    # infile = sys.argv[1]
    infile = ('/Users/name/Desktop/PDB/training_set_pssm/idlist/')  # the idlist to loop on
    #print(infile)
    # Call the function by looping through an id list+'.pssm' extension
    # name the outfile the same --> id list+'.profile'
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")  # containing the id of the file but NOT the extension ".pssm"
    #print(idlist)
    for ids in idlist:
        #print(ids)
        part2 = ids.rstrip() + '.pssm'  # removing newlinecharacter, adding necessary extension
        #print(part2)
        if os.path.isfile(infile) == True:  # does this file exist
            ofile = ids.rstrip() + '.profile'  # outfile for each id with correct extension
            #print(ofile)
            profile_list = relevant_lines(infile)
            #print(profile_list)
            write_normalized_profile(profile_list, ofile)
            #print(write_normalized_profile)
            #print(profile_list)

        else:
            print("Error file: " + infile + " not found.")

JavaScript
​x
 
#!/usr/bin/env python3
import sys
import os.path
from pathlib import Path
​
​
def pssm_list(infile):  # call list of file names and for dsspfile
    ''' Reads relevant lines from a pssm file and saves them to a list.
    Returns values of the 2 matrices (no header).'''
    with open(infile) as ofile:
        flist = ofile.readlines()[3:-6]  # list of each line of the file excluding first 3 & last 6 lines
        return flist
​
def lines_to_list(infile1):
    ''' Reads all lines from a file and saves them to a list containing the 'n' char. '''
    all_lines_list = []
    with open(infile1, 'r') as rfile:
        all_lines_list = rfile.readlines()
    return all_lines_list  # need to rstrip in a loop for using filenames.
​
def relevant_lines(infile2):
    '''Takes list (extracted from a .pssm file) and extracts the Sequence Profile Portion only.
    Returns a list of list where each element is one line of the sequence profile matrix. '''
    pssm_profile_list = pssm_list(infile2)  # contains all lines from the pssm file.
    profile_final_list = []  # for holding relevant fields of the lines
    for line in pssm_profile_list:
        #print(line)
        pssm_profile_list = line.split()[22:42]  # profile ranges from pos 22-42
        profile_final_list.append(pssm_profile_list)  # appending to final list of lists
    return profile_final_list  # list of lists
​
​
# # divide all values by 100
def write_normalized_profile(profile_final_list, ofile):
    '''Takes profile list of lists and outfile name as input. Writes each number that is in
    one of the sublists and devides it by 100. The number is converted to a string and added
    a tab and written to a file. After each sublist a newline character is written to the file.'''
    with open(ofile, "a") as wfile:
        for sublist in profile_final_list:
            #             print(sublist)
            for el in sublist:
                num = int(el) / 100
                numstring = str(num)
                wfile.write(numstring + 't')  # adding tab after each number
            wfile.write("n")  # adding newline at the end of each sublist.
            #print(sublist)
            #print(numstring)
​
​
​
​
​
if __name__ == '__main__':
    # infile = sys.argv[1]
    infile = ('/Users/name/Desktop/PDB/training_set_pssm/idlist/')  # the idlist to loop on
    #print(infile)
    # Call the function by looping through an id list+'.pssm' extension
    # name the outfile the same --> id list+'.profile'
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")  # containing the id of the file but NOT the extension ".pssm"
    #print(idlist)
    for ids in idlist:
        #print(ids)
        part2 = ids.rstrip() + '.pssm'  # removing newlinecharacter, adding necessary extension
        #print(part2)
        if os.path.isfile(infile) == True:  # does this file exist
            ofile = ids.rstrip() + '.profile'  # outfile for each id with correct extension
            #print(ofile)
            profile_list = relevant_lines(infile)
            #print(profile_list)
            write_normalized_profile(profile_list, ofile)
            #print(write_normalized_profile)
            #print(profile_list)
​
        else:
            print("Error file: " + infile + " not found.")
​

Answer

First and foremost lets fix your paths, you imported from pathlib import Path but never used it.

lets declare infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist/'), we now have some helpfull functions we can use for finding problems.

try out some of these to make sure you are searching in the right place.

#this will write out the absolute filepath usefull to check if it is correct
infile.absolute()

#this tells you if this path exists
infile.exists()

#this tells you if this is a file
infile.is_file()

JavaScript
 
#this will write out the absolute filepath usefull to check if it is correct
infile.absolute()
​
#this tells you if this path exists
infile.exists()
​
#this tells you if this is a file
infile.is_file()
​

let’s start at the beginning I’ll try and explain what is happening in your code line by line.

if __name__ == '__main__':
    # i don't really know what this infile is, is it a file containing
    # d1s7za_.fasta.pssm 
    # d1s98a_.fasta.pssm 
    # d1s99a_.fasta.pssm 

    #or a directory containing files named
    #d1s7za_.fasta.pssm 
    #d1s98a_.fasta.pssm 
    #d1s99a_.fasta.pssm 
    #...
    infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist')

    # this returns a list of string presumably in the form of
    # d1ciya2.fastan
    # d1ciya3.fastan
    # d1cq3a_.fastan
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")

   # loop over that list
   for ids in idlist:
        # strips the 'n' from the id and adds '.pssm'
        # you now have something like 'd1d0qa_.fasta.pssm'
        # you never use this
        part2 = ids.rstrip() + '.pssm'
        
        # was 'if os.path.isfile(infile) == True:' but should be :
        if infile.is_file():

            # strips the 'n' from the id and adds '.profile'
            # you now have something like 'd1d0qa_.fasta.profile'
            ofile = ids.rstrip() + '.profile'

            # here is where it becomes a bit weird
            # in relevant_lines you say:
            # Takes list (extracted from a .pssm file) and extracts the Sequence Profile Portion only.
            # is infile a .pssm file?
            # is this correct?
            profile_list = relevant_lines(infile)

            # this seems fine, it writes the normalized data to ofile.
            # ofile will be something like 'd1d0qa_.fasta.profile'
            write_normalized_profile(profile_list, ofile)

JavaScript
 
if __name__ == '__main__':
    # i don't really know what this infile is, is it a file containing
    # d1s7za_.fasta.pssm 
    # d1s98a_.fasta.pssm 
    # d1s99a_.fasta.pssm 
​
    #or a directory containing files named
    #d1s7za_.fasta.pssm 
    #d1s98a_.fasta.pssm 
    #d1s99a_.fasta.pssm 
    #...
    infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist')
​
    # this returns a list of string presumably in the form of
    # d1ciya2.fastan
    # d1ciya3.fastan
    # d1cq3a_.fastan
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")
​
   # loop over that list
   for ids in idlist:
        # strips the 'n' from the id and adds '.pssm'
        # you now have something like 'd1d0qa_.fasta.pssm'
        # you never use this
        part2 = ids.rstrip() + '.pssm'
        
        # was 'if os.path.isfile(infile) == True:' but should be :
        if infile.is_file():
​
            # strips the 'n' from the id and adds '.profile'
            # you now have something like 'd1d0qa_.fasta.profile'
            ofile = ids.rstrip() + '.profile'
​
            # here is where it becomes a bit weird
            # in relevant_lines you say:
            # Takes list (extracted from a .pssm file) and extracts the Sequence Profile Portion only.
            # is infile a .pssm file?
            # is this correct?
            profile_list = relevant_lines(infile)
​
            # this seems fine, it writes the normalized data to ofile.
            # ofile will be something like 'd1d0qa_.fasta.profile'
            write_normalized_profile(profile_list, ofile)
​

solution:

if __name__ == '__main__':
    pssm_directory = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist/') #the directory

    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")

    for ids in idlist:

        infile = pssm_directory.joinpath(ids.rstrip() + '.pssm') #generate filename from id
        if infile.is_file(): #check if filename exists

            ofile = ids.rstrip() + '.profile'

            profile_list = relevant_lines(infile)

            write_normalized_profile(profile_list, ofile)

JavaScript
 
if __name__ == '__main__':
    pssm_directory = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist/') #the directory
​
    idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist")
​
    for ids in idlist:
​
        infile = pssm_directory.joinpath(ids.rstrip() + '.pssm') #generate filename from id
        if infile.is_file(): #check if filename exists
​
            ofile = ids.rstrip() + '.profile'
​
            profile_list = relevant_lines(infile)
​
            write_normalized_profile(profile_list, ofile)
​

Advertisement

Answer