Jan-17-2019, 04:57 AM
In this post, I have allowed both counting newline, and eliminating newline
Note this code can be reused for different sequences.
It could be easily modified to get the sequence from fasta
The results are in ../data/sequence_slicedir/
I need to get some sleep so won't see any comments for several hours.
Note this code can be reused for different sequences.
It could be easily modified to get the sequence from fasta
The results are in ../data/sequence_slicedir/
import os from pathlib import Path class SliceGenome: def __init__(self): # Make sure starting directory is source directory os.chdir(os.path.abspath(os.path.dirname(__file__))) self.homepath = Path('.') self.datapath = self.homepath / '../data' self.datapath.mkdir(exist_ok=True) self.sequencedir = self.datapath / 'sequences' self.sequencedir.mkdir(exist_ok=True) self.sequence_slicedir = self.datapath / 'sequence_slicedir' self.sequence_slicedir.mkdir(exist_ok=True) self.sequence = None def read_genome(self, filename, remove_lf=True): ''' Initialization created a data directory one level above src code directory, with a sequences directory under that. Place all sequence files in this directory ''' # want 8 bit ascii encoding, and skip 1st line with filename.open(encoding='ascii') as fp: self.sequence = '' if remove_lf: for line in fp: self.sequence = self.sequence + line.strip() else: self.sequence = fp.read() def get_savefilename(self, filename, start_idx, end_idx): return self.sequence_slicedir / f"{filename.name.split('.')[0]}{end_idx}-{start_idx}.txt" def get_slice(self, fn, start_idx, end_idx, remove_lf=True): filename = self.sequencedir / fn self.read_genome(filename, remove_lf) #account for zero base start_idx = start_idx -1 end_idx = end_idx - 1 idx1 = start_idx + (end_idx - start_idx) slice = self.sequence[start_idx:idx1] savefilename = self.get_savefilename(filename, start_idx, end_idx) with savefilename.open('w') as fp: fp.write(slice) print(f'\nlength: {len(slice)}, slice: \n{slice}') def main(): # change id for different sequence, and start and end indexes # slice will have name of sequence file, + indexes and found in ../data/sequence_slicedir sequence_filename = 'AvinosumDSM180.txt' start_idx = 94443 end_idx = 95255 sg = SliceGenome() # without LF removal sg.get_slice(sequence_filename, start_idx, end_idx, remove_lf=False) # with lf removal sg.get_slice(sequence_filename, start_idx, end_idx, remove_lf=True) if __name__ == '__main__': main()with newline
Output:length: 812, slice:
CTGATCGCTCTTGGCGCCGACGGCTAATACCCATCCTCGCCACTATCCTGGTAGCCGT
CGTTGTGTTCGTCTGGACGCTGAATCTGACCGACTGGCTGATGCGCCCACCGCCAGCCTCCACGGTCGAG
TATCTGCCACATACCGACGGCGCCGAGAGGCTGGTTCCGCCACCCGTCAGCGAAGCCCTGAGCCTGGAAC
GCTTCCAGGCCGCCCGGCGCGCGTGCGATGGTCCTTGTGTCACCGACTTCGGCACACCGCTGGGTCGCGC
CAACGGCGTCGAGGCCCGCTCCAACTGTGCCTCGCTCTGCGTGCGGCTCGAATCGAGCTTTGTCGATCCC
GACTCTGGCCGGATCTGGATCGCCCGCTCGGGCGAGCACCCCGAGCCGCTGGAATATTCCGGTCTGGCCT
ATCAGTGCGTCGAGTATGCGCGGCGCTGGTGGATCCAGACGCTCGGCCTGACCTTCGGCGACGTGCCGAC
GGCCGCCGACATCCTGCGCCTCACCGAGGGCCGGCGTCTGTCCGATCAGGCGGTCATTCCGCTCGGTCGG
TCGCTCAACGGACACGCCCGCCGCGCCCCCGAACGCGGTGATCTGGTGATCTATGCCGCCGATCCCAACG
ACCCGGAGTGGCGCGCCGGGCATGTCGCCGTTGTGGTCGACACCGATCTCGAACAGGGCTGGGTCGCACT
GGCCGAGCAGAACTACGACAACCGTCCCTGGAGCGATCCGGAGTACCATGCCAGGCGTATCCGAATCGTG
CGTATAGGCGAACGCTATAGCCTGCTCGACGTCGCCCAAGATC
without newlineOutput:length: 812, slice:
TGCGCGGTGATGTAAGGGTGCATACCGCGAGTCCGATCGCCGCCGCGTGGCTGCTCGCCGTCGGTCTGGTGGCCCACGCCGAAGAGCCGCCGACCGTCGCTCTGACGGTTCCGGCGGCCGCGCTGCTCCCTGACGGCGCACTCGGTGAGAGCATCGTCCGTGGTCGGCGCTATCTGTCGGATACGCCGGCTCAGTTGCCCGACTTCGTTGGCAATGGACTGGCCTGCCGACACTGCCATCCCGGCCGAGACGGGGAGGTCGGCACCGAAGCCAATGCGGCCCCCTTCGTCGGCGTCGTCGGACGCTTTCCGCAGTACAGCGCCCGACATGGCCGCCTCATCACGCTCGAACAGCGCATCGGCGATTGTTTCGAGCGCAGTCTCAACGGTCGAGCGCTCGCGCTCGATCACCCCGCCCTGATCGACATGCTGGCCTACATGAGCTGGCTGTCGCAGGGCGTGCCCGTGGGCGCTGTCGTAGCGGGACATGGCATCCCGACGCTGACGCTGGAGCGCGAACCGGATGGGGTGCATGGGGAGGCGCTCTACCAGGCCAGGTGTCTGGCCTGTCATGGAGCCGACGGGAGCGGCACGCTGGACGCCGATGGACGCTATCTTTTCCCGCCTCTGTGGGGGCCGCGTTCGTTCAACACCGGCGCGGGGATGAACCGTCAGGCCACGGCCGCCGGGTTCATCAAGCACAAGATGCCGTTAGGCGCCGATGACTCGCTCAGCGATGAAGAGGCGTGGGACGTGGCCGGTTTCGTGCTCACGCATCCGCGTCCACTGTTCCAGGAGCCGACGGGTGACTGA
This can be easily broken into 70 character lines.I need to get some sleep so won't see any comments for several hours.