My code does not throw an error, it simply creates the files, but of which are empty. I tried it from the command line, and it works using the wildcard training_set_pssm/*.pssm path, but I must do it from the IDE because it is not printing the correct output anyway.
The input file is a set of checkpoint files that look like this:
From this file, which is a text file, saved as .pssm, essentially, I am extracting only the PROFILE side, which is on the right and NORMALIZING it at the same time… my code does not seem to do it correctly, and from the IDE it does not do it at all, so I am not sure what I need to modify in the script to do so at this point.
Here is the code:
#!/usr/bin/env python3 import sys import os.path from pathlib import Path def pssm_list(infile): # call list of file names and for dsspfile ''' Reads relevant lines from a pssm file and saves them to a list. Returns values of the 2 matrices (no header).''' with open(infile) as ofile: flist = ofile.readlines()[3:-6] # list of each line of the file excluding first 3 & last 6 lines return flist def lines_to_list(infile1): ''' Reads all lines from a file and saves them to a list containing the 'n' char. ''' all_lines_list = [] with open(infile1, 'r') as rfile: all_lines_list = rfile.readlines() return all_lines_list # need to rstrip in a loop for using filenames. def relevant_lines(infile2): '''Takes list (extracted from a .pssm file) and extracts the Sequence Profile Portion only. Returns a list of list where each element is one line of the sequence profile matrix. ''' pssm_profile_list = pssm_list(infile2) # contains all lines from the pssm file. profile_final_list = [] # for holding relevant fields of the lines for line in pssm_profile_list: #print(line) pssm_profile_list = line.split()[22:42] # profile ranges from pos 22-42 profile_final_list.append(pssm_profile_list) # appending to final list of lists return profile_final_list # list of lists # # divide all values by 100 def write_normalized_profile(profile_final_list, ofile): '''Takes profile list of lists and outfile name as input. Writes each number that is in one of the sublists and devides it by 100. The number is converted to a string and added a tab and written to a file. After each sublist a newline character is written to the file.''' with open(ofile, "a") as wfile: for sublist in profile_final_list: # print(sublist) for el in sublist: num = int(el) / 100 numstring = str(num) wfile.write(numstring + 't') # adding tab after each number wfile.write("n") # adding newline at the end of each sublist. #print(sublist) #print(numstring) if __name__ == '__main__': # infile = sys.argv[1] infile = ('/Users/name/Desktop/PDB/training_set_pssm/idlist/') # the idlist to loop on #print(infile) # Call the function by looping through an id list+'.pssm' extension # name the outfile the same --> id list+'.profile' idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist") # containing the id of the file but NOT the extension ".pssm" #print(idlist) for ids in idlist: #print(ids) part2 = ids.rstrip() + '.pssm' # removing newlinecharacter, adding necessary extension #print(part2) if os.path.isfile(infile) == True: # does this file exist ofile = ids.rstrip() + '.profile' # outfile for each id with correct extension #print(ofile) profile_list = relevant_lines(infile) #print(profile_list) write_normalized_profile(profile_list, ofile) #print(write_normalized_profile) #print(profile_list) else: print("Error file: " + infile + " not found.")
Advertisement
Answer
First and foremost lets fix your paths, you imported from pathlib import Path
but never used it.
lets declare infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist/')
, we now have some helpfull functions we can use for finding problems.
try out some of these to make sure you are searching in the right place.
#this will write out the absolute filepath usefull to check if it is correct infile.absolute() #this tells you if this path exists infile.exists() #this tells you if this is a file infile.is_file()
let’s start at the beginning I’ll try and explain what is happening in your code line by line.
if __name__ == '__main__': # i don't really know what this infile is, is it a file containing # d1s7za_.fasta.pssm # d1s98a_.fasta.pssm # d1s99a_.fasta.pssm #or a directory containing files named #d1s7za_.fasta.pssm #d1s98a_.fasta.pssm #d1s99a_.fasta.pssm #... infile = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist') # this returns a list of string presumably in the form of # d1ciya2.fastan # d1ciya3.fastan # d1cq3a_.fastan idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist") # loop over that list for ids in idlist: # strips the 'n' from the id and adds '.pssm' # you now have something like 'd1d0qa_.fasta.pssm' # you never use this part2 = ids.rstrip() + '.pssm' # was 'if os.path.isfile(infile) == True:' but should be : if infile.is_file(): # strips the 'n' from the id and adds '.profile' # you now have something like 'd1d0qa_.fasta.profile' ofile = ids.rstrip() + '.profile' # here is where it becomes a bit weird # in relevant_lines you say: # Takes list (extracted from a .pssm file) and extracts the Sequence Profile Portion only. # is infile a .pssm file? # is this correct? profile_list = relevant_lines(infile) # this seems fine, it writes the normalized data to ofile. # ofile will be something like 'd1d0qa_.fasta.profile' write_normalized_profile(profile_list, ofile)
solution:
if __name__ == '__main__': pssm_directory = Path('/Users/name/Desktop/PDB/training_set_pssm/idlist/') #the directory idlist = lines_to_list("/Users/name/Desktop/PDB/training_set_idlist") for ids in idlist: infile = pssm_directory.joinpath(ids.rstrip() + '.pssm') #generate filename from id if infile.is_file(): #check if filename exists ofile = ids.rstrip() + '.profile' profile_list = relevant_lines(infile) write_normalized_profile(profile_list, ofile)