Here is the final program. It Sorts and Selects in one module.
It is written so that you can run it from the command line using:
python SelectAndSortFasta.py infilename outfilename min_seq_size
Or, you can import it into another program and call it from within, similar to:
# Add the following import at the top of your program
import SelectAndSortFasta
# In your initialization routine, add:
# If a class:
self.sasf = SelectAndSortFasta.SelectAndSortFasta()
# If just a function:
sasf = SelectAndSortFasta.SelectAndSortFasta()
# Then, when you want to run a file:
sasf.sort_fasta_file(infile, outfile, minlen)
Here's the code:
from operator import itemgetter
import sys


class SelectAndSortFasta:
    """
    Sort the records of a FASTA file in decreasing size order, dropping
    records shorter than a minimum length.

    The input file is indexed first (file position, header line and sequence
    length of every record); the output is then produced by seeking back to
    each record, so the sequences themselves are never held in memory.
    """

    def __init__(self, *, verbose=True):
        """
        :param verbose: When True (the default), the sorted header table is
            printed after the input file has been indexed.
        """
        # One entry per record: [file position, header line, sequence length]
        self.headers = []
        self.infile = None
        self.outfile = None
        self.minlen = 0
        self.verbose = verbose

    def ExtractHeader(self):
        """
        Reads all headers, saves starting file position, header, and size
        in self.headers, then sorts self.headers by size, largest first.

        Any previous content of self.headers is discarded, so the instance
        can be reused for several files.
        :return: None
        """
        self.headers = []
        current = None
        with open(self.infile, 'r') as f:
            # readline() (rather than iterating the file) keeps f.tell()
            # usable in text mode.
            while True:
                fptr = f.tell()
                line = f.readline()
                if not line:
                    # readline() returns '' only at end of file
                    break
                line = line.strip()
                if not line:
                    # skip empty lines
                    continue
                if line.startswith('>'):
                    current = [fptr, line, 0]
                    self.headers.append(current)
                elif current is not None:
                    current[2] += len(line)
                # Sequence text before the first header belongs to no record
                # and is ignored.

        # This is the sort routine, sorts on column 2 (size) of self.headers,
        # in reverse order
        self.headers.sort(key=itemgetter(2), reverse=True)
        if self.verbose:
            self.show_header()

    def show_header(self):
        """
        display self.headers list
        :return: None
        """
        for item in self.headers:
            print(f'File ptr: {item[0]}, Size: {item[2]}, Header: {item[1]}')

    def write_outfile(self):
        """
        Reads self.headers (which is sorted in reverse size order), seeks to
        each record in the input file (using the file pointer held in
        column 0 of self.headers) and writes the record to the output file.
        Records whose size is below self.minlen are skipped.
        :return: None
        """
        minlen = int(self.minlen)
        with open(self.infile) as f, open(self.outfile, 'w') as fo:
            for fptr, _header, size in self.headers:
                # Ignore entry if too small
                if size < minlen:
                    continue
                f.seek(fptr, 0)
                fo.write(self._terminated(f.readline()))
                while True:
                    buf = f.readline()
                    # '' means end of file: without this check the last
                    # record of the input would never terminate the loop.
                    if not buf or buf.lstrip().startswith('>'):
                        break
                    fo.write(self._terminated(buf))

    @staticmethod
    def _terminated(line):
        """Return line with a trailing newline, adding one if it is missing."""
        # The final line of the input may lack a newline; once records are
        # reordered it must not run into the following header.
        return line if line.endswith('\n') else line + '\n'

    def sort_fasta_file(self, infile, outfile, minlen):
        """
        Launch pad for sort and select program
        :param infile: Name of input fasta file
        :param outfile: Name of output fasta file
        :param minlen: Minimum size of output record (smaller input records
            are ignored). May be an int or a numeric string.
        :return: None
        :raises ValueError: if minlen cannot be converted to an integer
        """
        self.infile = infile
        self.outfile = outfile
        # Command-line callers pass a string; normalise once here.
        self.minlen = int(minlen)
        self.ExtractHeader()
        self.write_outfile()


if __name__ == '__main__':
    # Test routine
    srtf = SelectAndSortFasta()
    if len(sys.argv) > 1:
        if len(sys.argv) < 4:
            sys.exit('usage: python SelectAndSortFasta.py '
                     'infilename outfilename min_seq_size')
        infile = sys.argv[1]
        outfile = sys.argv[2]
        minlen = int(sys.argv[3])
    else:
        infile = 'data/fasta/AINZ01/AINZ01.1.fsa_nt'
        outfile = 'data/fasta/AINZ01/AINZ01sorted.1.fsa_nt'
        minlen = 1000
    srtf.sort_fasta_file(infile, outfile, minlen)

# You may want to suppress the printout made by self.show_header() at the end
# of ExtractHeader; create the object with SelectAndSortFasta(verbose=False).