Apr-22-2017, 08:33 PM
hi,
Can you tell why (NN NN NN NN ) IS NOT WORKING BELOW PROGRAM and (JJ) is only alone coming. actually, i required this pattern (jj* NN+)
PLEASE COPERATE ME TO CORRECT THIS CODE
Can you tell why (NN NN NN NN ) IS NOT WORKING BELOW PROGRAM and (JJ) is only alone coming. actually, i required this pattern (jj* NN+)
PLEASE COPERATE ME TO CORRECT THIS CODE
class extract(object): def __init__(self,regex=''): self.w=[] self.pos=[] self.sw=[] self.spos=[] self.regex=regex self.r=re.compile(regex) self.stl=[] self.tgl=[] def map_con(self,begin,end): self.index=[] f=1 for i in range(len(self.d)): mi=self.d[i][0] ma=self.d[i][1] if begin>=mi and begin<ma and f: self.index.append(i) f=0 elif end>ma and not f: self.index.append(i) elif end<ma and not f: #self.index.append(i) f=1 elif not f: self.index.append(i) break return self.index def n_gram_extract(self,line): #w=nltk.word_tokenize(line.lower()) self.d={} self.tag_dic={} line=line.replace('-',' - ') w1=nltk.word_tokenize(line) tag=nltk.pos_tag(w1) self.spos=[j for i,j in tag] self.sw=[i.lower() for i in w1] self.w=' '.join(self.sw) self.pos=' '.join(self.spos) pos=0 if not self.regex: print 'No regex found for spliting' return self.sw for i,j in enumerate(self.spos): l=pos+len(j) self.d[i]=[pos,l] pos=l+1 my_iter=self.r.finditer(self.pos) self.stl=[] self.tgl=[] count = 0 for match in my_iter: count += 1 ran=match.span() #print ran,count posi= self.map_con(ran[0],ran[1]) st='' tg='' for num in posi: st+=self.sw[num]+' ' tg+=self.spos[num]+' ' st=st.strip() tg=tg.strip() self.stl.append(st) self.tgl.append(tg) self.tag_dic[st]=tg #print st,tg #print count return self.tag_dic#self.stl#,tgl,self.tag_dic if __name__=='__main__': s="""Fast Breeder Test Reactor is a 40 MWt/13.2 MWe sodium cooled, loop type, mixed carbide-fuelled reactor. Its main aim is to gain experience in the design, construction and operation of fast reactors including sodium systems and to serve as an irradiation facility for development of fuel and structural materials for future fast reactors. """ o=extract('(JJ NN)|(JJJJNN)|(JJ NN NN)|(JJ NN NN NN)|NN|(NN NN)|(NN NN NN)|(NN NN NN NN)|(NN JJ)|(NN NN JJ)|(NN JJ NN NN)') ne=o.n_gram_extract(s) print(ne)
Output:{'development': 'NN', 'irradiation': 'NN', 'main aim': 'JJ NN', 'reactor': 'NN', 'sodium': 'NN', 'mwt/13.2': 'NN', 'experience': 'NN', 'facility': 'NN', 'construction': 'NN', 'design': 'NN', 'fast': 'JJ', 'mwe': 'NN', 'fuel': 'NN', 'loop type': 'JJ NN', 'operation': 'NN', 'structural': 'JJ', 'breeder test': 'JJ NN', 'systems': 'NNS'}