获取启动子序列
获取启动子序列
- 获取物种基因组序列genome.fa文件
- 获取物种注释信息gff文件
输入参数:
positionFile, genomeFa, outfa
(pos坐标文件,包含四列 chrom start end strand)
使用Python脚本
Python |
---|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67 | #!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File : Promoter.py
@Time : 2021/08/15 20:16:07
@Author : biolxy
@Version : 1.0
@Contact : biolxy@aliyun.com
@Desc : None
@Usage : python Promoter.py pos.Lee.txt /home/lixy/soybean/other/glyma.Lee.gnm1.BXNC.genome_main.fna out.Lee.fa
'''
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import sys
class Promoter(object):
@staticmethod
def get_promoter_by_genome(positionFile, genomeFa, outfa, length=2000):
"""positionFile
Chr20 35557820 35562522 +
length: 2000
"""
dict1 = {}
pos_list = Promoter.get_pos_list(positionFile)
for seq_record in SeqIO.parse(genomeFa, "fasta"):
seqid = str(seq_record.id)
for tmp_l in pos_list:
if seqid == tmp_l[0]:
seq = seq_record.seq
s = int(tmp_l[1]) - length
e = int(tmp_l[1])
if '-' == tmp_l[3]:
s = int(tmp_l[2])
e = int(tmp_l[2]) + length
# print(s, e)
seq = seq[s:e]
if '-' == tmp_l[3]:
seq = seq.reverse_complement() # 反向互补
n_seqid = seqid + "_" + str(tmp_l[1]) + ":" + str(tmp_l[2]) + "_" + str(tmp_l[3])
dict1[n_seqid] = seq
records = []
for seqid in dict1:
seq = dict1[seqid]
rec1 = SeqRecord(seq, id=seqid, description="")
records.append(rec1)
SeqIO.write(records, outfa, "fasta")
@staticmethod
def get_pos_list(infile):
list_ = []
with open(infile, 'r') as ff:
for line in ff:
line = line.strip()
tag = line.split("\t") # chrom start end strand
list_.append(tag)
return list_
if __name__ == "__main__":
positionFile, genomeFa, outfa = sys.argv[1], sys.argv[2], sys.argv[3]
Promoter.get_promoter_by_genome(positionFile, genomeFa, outfa)
|