|
# Written and tested in Python 2.7.2 and bipython
#Extract_annotation_using_BlastSubjectIds(fast).py
# 本脚本用来从 blast 结果(m6 fomat)中,根据第二列subject id 从原始database中抽取相关注释及参考序列。
import os
from Bio import SeqIO
while True:
Parameters=raw_input("Enter parameters 1.[.blast] and 2.[.fasta]sepeated by Space: ")
try:
X1=Parameters.strip().split(' ')[0]
X2=Parameters.strip().split(' ')[1]
if not os.path.exists(X1):
print 'The fasta file can not be found in the working directory!'
continue
break
except:
print 'Errors: invalid input format or not enough input !'
continue
a={}
j=0
for line in open(X1,'r'):
a[line.rstrip().split('t')[1]]=line.rstrip()
print len(a)
f1=open(X1.replace('.blast','')+'.annotated.blast','w')
f2=open(X1.replace('.blast','')+'.ref.fasta','w')
i=0
for record in SeqIO.parse(X2,'fasta'):
i+=1
if i%1000000==0:
print i,'seqs searched!'
try:
j+=1
f1.write(a[str(record.id)]+'t'+str(record.description)+'n')
f2.write('n'.join(['>'+str(record.description),str(record.seq)])+'n')
except KeyError:
continue
print i,'sequences in',X2
print j,'annotatins written!'
print 'OK, finished!'
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-5-12 12:33
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社