跳转至

2021

获取启动子序列

获取启动子序列

  1. 获取物种基因组序列genome.fa文件
  2. 获取物种注释信息gff文件

输入参数: positionFile, genomeFa, outfa (pos坐标文件,包含四列 chrom start end strand)

使用Python脚本

Python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File    :   Promoter.py
@Time    :   2021/08/15 20:16:07
@Author  :   biolxy 
@Version :   1.0
@Contact :   biolxy@aliyun.com
@Desc    :   None
@Usage   :   python Promoter.py pos.Lee.txt /home/lixy/soybean/other/glyma.Lee.gnm1.BXNC.genome_main.fna out.Lee.fa
'''
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import sys


class Promoter(object):
    @staticmethod
    def get_promoter_by_genome(positionFile, genomeFa, outfa, length=2000):
        """positionFile
        Chr20   35557820    35562522    +

        length: 2000
        """
        dict1 = {}
        pos_list = Promoter.get_pos_list(positionFile)
        for seq_record in SeqIO.parse(genomeFa, "fasta"):
            seqid = str(seq_record.id)
            for tmp_l in pos_list:
                if seqid == tmp_l[0]:
                    seq = seq_record.seq
                    s = int(tmp_l[1]) - length
                    e = int(tmp_l[1])

                    if '-' == tmp_l[3]:
                        s = int(tmp_l[2])
                        e = int(tmp_l[2]) + length
                    # print(s, e)
                    seq = seq[s:e]
                    if '-' == tmp_l[3]:
                        seq = seq.reverse_complement() # 反向互补
                    n_seqid = seqid + "_" + str(tmp_l[1]) + ":" + str(tmp_l[2]) + "_" + str(tmp_l[3])
                    dict1[n_seqid] = seq

        records = []
        for seqid in dict1:
            seq = dict1[seqid]
            rec1 = SeqRecord(seq, id=seqid, description="")
            records.append(rec1)
        SeqIO.write(records, outfa, "fasta")


    @staticmethod
    def get_pos_list(infile):
        list_ = []
        with open(infile, 'r') as ff:
            for line in ff:
                line = line.strip()
                tag = line.split("\t") # chrom start end strand
                list_.append(tag)
        return list_


if __name__ == "__main__":
    positionFile, genomeFa, outfa = sys.argv[1], sys.argv[2], sys.argv[3]
    Promoter.get_promoter_by_genome(positionFile, genomeFa, outfa)