Oct-04-2017, 07:09 PM
I have several files were column A is a tagname made of letters and numbers (e.g. Rob_0001,...) and column B is a value for column A (e.g. Rob_0001 \t 89). I am trying to merge them but not all tagnames are equal in all files (e.g. some files have no Rob_0005).
I have split it into two scripts: one that generates files with gaps so that all column As are the same and now I am going to try to merge. But it seems like I am making the scripts unnecessarily complicated with 100 lines of code! ... is there a better way? Thanks. Here is the code that introduces gaps so that all tagneames in all files are similar
import os #this imports the operating system module that lets me look at files and directories
filenames = os.listdir('c:/python27/C/U') #this is the list of files in that directory
for filename in filenames:#here it loops through each file in the list
infile = open(os.path.join('c:/python27/C/U', filename), 'r') #this opens the file and makes it readable
outputfilename=filename+"_gaps.txt"#this creates a new file name for the modified files that I am about to create
outfile= open(os.path.join('c:/python27/C/U', outputfilename), 'w') #this opens these new files and makes them writable
locusnumber = 0
linenumber =0 #this is the beginning of a loop; it needs to be set back to zero for each file
newlocusnumber=1
for line in infile:#this loops through each line in the file,
if linenumber==0:#first line is the header so just write it
outfile.write(line)
elif locusnumber<10:#for lines 1-9
elements=line.split('\t')#this splits lines into different items
elementnumber=len(elements)
if elements[0]==('Cabther_A000'+ str(locusnumber)):#this is why only for lines 1-9, so that the total digit nubmer equals 4
outfile.write(line)
else:
word=elements[0]
number=int(word[9:])
difference=number-locusnumber
locusnumber = locusnumber + difference
outfile.write('\n'*difference + line)
elif locusnumber<100:#lines 10-99
elements=line.split('\t')#this splits lines into different items
if elements[0]==('Cabther_A00'+ str(locusnumber)):#total digit number equals 4
outfile.write(line)
else:
word=elements[0]
number=int(word[9:])
difference=number-locusnumber
locusnumber = locusnumber + difference
outfile.write('\n'*difference + line)
elif locusnumber<1000:
elements=line.split('\t')#this splits lines into different items
if elements[0]==('Cabther_A0'+ str(locusnumber)):
outfile.write(line)
else:
word=elements[0]
number=int(word[9:])
difference=number-locusnumber
locusnumber = locusnumber + difference
outfile.write('\n'*difference + line)
elif locusnumber<2274:
elements=line.split('\t')#this splits lines into different items
if elements[0]==('Cabther_A'+ str(locusnumber)):
outfile.write(line)
else:
word=elements[0]
number=int(word[9:])
difference=number-locusnumber
locusnumber = locusnumber + difference
outfile.write('\n'*difference + line)
elif locusnumber>=2274:
if newlocusnumber<0:
elements=line.split('\t')#this splits lines into different items
if elements[0]==('Cabther_B000'+ str(newlocusnumber)):
outfile.write(line)
newlocusnumber=newlocusnumber+1
else:
word=elements[0]
number=int(word[9:])
difference=number-newlocusnumber
newlocusnumber = newlocusnumber + difference
outfile.write('\n'*difference + line)
newlocusnumber=newlocusnumber+1
elif newlocusnumber<100:
elements=line.split('\t')#this splits all headers into different items
if elements[0]==('Cabther_B00'+ str(newlocusnumber)):
outfile.write(line)
newlocusnumber=newlocusnumber+1
else:
word=elements[0]
number=int(word[9:])
difference=number-locusnumber
newlocusnumber = newlocusnumber + difference
outfile.write('\n'*difference + line)
newlocusnumber=newlocusnumber+1
elif newlocusnumber<800:
elements=line.split('\t')#this splits all lines into different items
if elements[0]==('Cabther_B0'+ str(newlocusnumber)):
outfile.write(line)
newlocusnumber=newlocusnumber+1
else:
word=elements[0]
number=int(word[9:])
difference=number-newlocusnumber
newlocusnumber = newlocusnumber + difference
outfile.write('\n'*difference + line)
newlocusnumber=newlocusnumber+1
linenumber = linenumber +1
locusnumber = locusnumber +1#this makes sure that the next line in the loop will not be the first one
infile.close()
outfile.close()
I have split it into two scripts: one that generates files with gaps so that all column As are the same and now I am going to try to merge. But it seems like I am making the scripts unnecessarily complicated with 100 lines of code! ... is there a better way? Thanks. Here is the code that introduces gaps so that all tagneames in all files are similar
import os #this imports the operating system module that lets me look at files and directories
filenames = os.listdir('c:/python27/C/U') #this is the list of files in that directory
for filename in filenames:#here it loops through each file in the list
infile = open(os.path.join('c:/python27/C/U', filename), 'r') #this opens the file and makes it readable
outputfilename=filename+"_gaps.txt"#this creates a new file name for the modified files that I am about to create
outfile= open(os.path.join('c:/python27/C/U', outputfilename), 'w') #this opens these new files and makes them writable
locusnumber = 0
linenumber =0 #this is the beginning of a loop; it needs to be set back to zero for each file
newlocusnumber=1
for line in infile:#this loops through each line in the file,
if linenumber==0:#first line is the header so just write it
outfile.write(line)
elif locusnumber<10:#for lines 1-9
elements=line.split('\t')#this splits lines into different items
elementnumber=len(elements)
if elements[0]==('Cabther_A000'+ str(locusnumber)):#this is why only for lines 1-9, so that the total digit nubmer equals 4
outfile.write(line)
else:
word=elements[0]
number=int(word[9:])
difference=number-locusnumber
locusnumber = locusnumber + difference
outfile.write('\n'*difference + line)
elif locusnumber<100:#lines 10-99
elements=line.split('\t')#this splits lines into different items
if elements[0]==('Cabther_A00'+ str(locusnumber)):#total digit number equals 4
outfile.write(line)
else:
word=elements[0]
number=int(word[9:])
difference=number-locusnumber
locusnumber = locusnumber + difference
outfile.write('\n'*difference + line)
elif locusnumber<1000:
elements=line.split('\t')#this splits lines into different items
if elements[0]==('Cabther_A0'+ str(locusnumber)):
outfile.write(line)
else:
word=elements[0]
number=int(word[9:])
difference=number-locusnumber
locusnumber = locusnumber + difference
outfile.write('\n'*difference + line)
elif locusnumber<2274:
elements=line.split('\t')#this splits lines into different items
if elements[0]==('Cabther_A'+ str(locusnumber)):
outfile.write(line)
else:
word=elements[0]
number=int(word[9:])
difference=number-locusnumber
locusnumber = locusnumber + difference
outfile.write('\n'*difference + line)
elif locusnumber>=2274:
if newlocusnumber<0:
elements=line.split('\t')#this splits lines into different items
if elements[0]==('Cabther_B000'+ str(newlocusnumber)):
outfile.write(line)
newlocusnumber=newlocusnumber+1
else:
word=elements[0]
number=int(word[9:])
difference=number-newlocusnumber
newlocusnumber = newlocusnumber + difference
outfile.write('\n'*difference + line)
newlocusnumber=newlocusnumber+1
elif newlocusnumber<100:
elements=line.split('\t')#this splits all headers into different items
if elements[0]==('Cabther_B00'+ str(newlocusnumber)):
outfile.write(line)
newlocusnumber=newlocusnumber+1
else:
word=elements[0]
number=int(word[9:])
difference=number-locusnumber
newlocusnumber = newlocusnumber + difference
outfile.write('\n'*difference + line)
newlocusnumber=newlocusnumber+1
elif newlocusnumber<800:
elements=line.split('\t')#this splits all lines into different items
if elements[0]==('Cabther_B0'+ str(newlocusnumber)):
outfile.write(line)
newlocusnumber=newlocusnumber+1
else:
word=elements[0]
number=int(word[9:])
difference=number-newlocusnumber
newlocusnumber = newlocusnumber + difference
outfile.write('\n'*difference + line)
newlocusnumber=newlocusnumber+1
linenumber = linenumber +1
locusnumber = locusnumber +1#this makes sure that the next line in the loop will not be the first one
infile.close()
outfile.close()