当前位置: 首页 > article >正文

转换embl为fa脚本embl2fa.py-脚本08

embl2fa.py 一行命令行

 python embl2fa.py X.embl > X.fa

embl2fa.py 

import sys
import re


class EMBL:
    """In-memory view of the entries in an EMBL flat file.

    The file is read eagerly on construction. Every ``//``-terminated entry
    becomes one ``EMBLRecord`` stored, in file order, in ``self.records``.
    """

    def __init__(self, file_name):
        """Parse *file_name* immediately.

        Args:
            file_name: Path of the EMBL flat file to read.

        Raises:
            OSError: If the file cannot be opened (``FileNotFoundError``
                when it does not exist).
        """
        self.file_name = file_name
        self.records = self._parse_embl_file()

    def _parse_embl_file(self):
        """Read the file and return a list with one record per entry."""
        with open(self.file_name, 'r') as f:
            return [self._create_record(chunk) for chunk in self._split_records(f)]

    @staticmethod
    def _split_records(lines):
        """Group *lines* into per-entry lists, splitting on ``//`` terminators.

        Blank lines that precede an entry are dropped, so trailing blank
        lines after the last ``//`` no longer yield an empty phantom entry.
        A final entry without a terminator is still returned.
        """
        chunks = []
        current = []
        for line in lines:
            if line.startswith("//"):
                if current:
                    chunks.append(current)
                    current = []
            elif current or line.strip():
                current.append(line)
        if current:
            chunks.append(current)
        return chunks

    @staticmethod
    def _extract_fields(record_data):
        """Return ``(id, description, sequence)`` parsed from one entry's lines.

        * ``ID``: the first token after the line code, without the trailing
          ``;`` used by the standard EMBL ID line. A bare ``ID`` line leaves
          the id empty instead of raising ``IndexError``.
        * ``DE``: continuation lines are joined with single spaces rather
          than each line overwriting the previous one.
        * ``SQ``: every following line is sequence data. Whitespace and the
          running base count at the end of each line are removed.
        """
        record_id = ""
        description_parts = []
        sequence_parts = []
        in_sequence = False
        for line in record_data:
            if in_sequence:
                # Once past SQ nothing is a header line any more, even if a
                # sequence line happens to begin with "ID", "DE" or "SQ".
                if not line.startswith("//"):
                    sequence_parts.append(line)
            elif line.startswith("ID"):
                tokens = line.split()
                if len(tokens) > 1:
                    record_id = tokens[1].rstrip(";")
            elif line.startswith("DE"):
                text = line[2:].strip()
                if text:
                    description_parts.append(text)
            elif line.startswith("SQ"):
                in_sequence = True
        sequence = re.sub(r"[\s\d]+", "", "".join(sequence_parts))
        return record_id, " ".join(description_parts), sequence

    def _create_record(self, record_data):
        """Build an ``EMBLRecord`` from the lines of a single entry."""
        record = EMBLRecord()
        record.id, record.description, record.sequence = self._extract_fields(record_data)
        return record

    def get_record_count(self):
        """Return the number of parsed records."""
        return len(self.records)

    def get_record(self, index):
        """Return the record at position *index* (0-based, file order)."""
        return self.records[index]


class EMBLRecord:
    """Value holder for a single EMBL entry.

    The three data attributes (``id``, ``description``, ``sequence``) start
    out as empty strings and are expected to be assigned by whoever parses
    the entry. The ``get_rm_*`` accessors currently return fixed placeholder
    values.
    """

    def __init__(self):
        # Every field begins empty; nothing is parsed here.
        self.id = self.description = self.sequence = ""

    # --- plain field accessors -------------------------------------------

    def get_id(self):
        """Return the entry identifier."""
        return self.id

    def get_description(self):
        """Return the entry description text."""
        return self.description

    def get_sequence(self):
        """Return the entry sequence string."""
        return self.sequence

    # --- placeholder accessors (real logic still to be added) -------------

    def get_rm_type(self):
        """Return the type label; currently always the same constant."""
        rm_type = "RMType"
        return rm_type

    def get_rm_sub_type(self):
        """Return the sub-type label; currently always empty."""
        return str()

    def get_rm_species_array(self):
        """Return the species names; currently always a new empty list."""
        return list()

    def get_rm_search_stages_array(self):
        """Return the search stages; currently always a new empty list."""
        return list()

    def get_rm_buffer_stages_array(self):
        """Return the buffer stages; currently always a new empty list."""
        return list()


def wrap_sequence(seq, width=50):
    """Return *seq* split into lines of *width* characters.

    The result always ends with exactly one newline; an empty sequence
    yields a single empty line.
    """
    pieces = [seq[i:i + width] for i in range(0, len(seq), width)]
    return "\n".join(pieces) + "\n"


def write_fasta_entry(header, seq, out=None):
    """Write one FASTA entry (``>header`` plus the wrapped sequence) to *out*.

    *out* defaults to ``sys.stdout``. The text is written as-is, so no blank
    line is emitted after the sequence.
    """
    out = sys.stdout if out is None else out
    out.write(f">{header}\n")
    out.write(wrap_sequence(seq))


def group_buffer_stages(stages):
    """Group buffer-stage specifications by the sequence range they cover.

    A specification is either ``"<stage>"`` (whole sequence, grouped under
    the key ``"full"``) or ``"<stage>[<start>-<end>]"`` (grouped under
    ``"<start>-<end>"``). Anything else is reported on stderr and skipped.

    Returns:
        A dict mapping each range key to the list of stage identifiers, in
        first-seen order.
    """
    grouped = {}
    for stage in stages:
        ranged = re.match(r"(\d+)\[(\d+)\-(\d+)\]", stage)
        if ranged:
            key = f"{ranged.group(2)}-{ranged.group(3)}"
            grouped.setdefault(key, []).append(ranged.group(1))
        elif re.match(r"(\d+)", stage):
            grouped.setdefault("full", []).append(stage)
        else:
            print(f"Warning: Buffer stage {stage} not understood!", file=sys.stderr)
    return grouped


def main(argv=None):
    """Convert the EMBL file named on the command line to FASTA on stdout.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        The process exit status: 0 on success, 1 on a usage or file error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("Usage: python X.py X.embl > X.fa", file=sys.stderr)
        return 1

    in_file = argv[1]

    # Only opening/reading the input can raise the errors handled here, so
    # the try block is limited to loading the file.
    try:
        db = EMBL(file_name=in_file)
    except FileNotFoundError:
        print(f"Error: File {in_file} not found!", file=sys.stderr)
        return 1
    except OSError as err:
        print(f"Error: Could not read {in_file}: {err}", file=sys.stderr)
        return 1

    for i in range(db.get_record_count()):
        record = db.get_record(i)
        record_id = record.get_id()
        record_type = f"#{record.get_rm_type()}"
        if record.get_rm_sub_type():
            record_type += f"/{record.get_rm_sub_type()}"
        description = record.get_description()
        sequence = record.get_sequence()

        species_list = " ".join(f"@{name}" for name in record.get_rm_species_array())
        search_stages = "[S:" + ",".join(record.get_rm_search_stages_array()) + "]"
        search_stages = re.sub(r",\]", "]", search_stages)

        # Main entry for the record.
        write_fasta_entry(
            f"{record_id}{record_type} {species_list} {search_stages} {description}",
            sequence,
        )

        # One additional "#buffer" entry per distinct buffer range.
        buffer_groups = group_buffer_stages(record.get_rm_buffer_stages_array())
        for range_key, stage_ids in buffer_groups.items():
            buffer_stages = "[S:" + ",".join(stage_ids) + "]"
            if range_key == "full":
                buffer_type = "#buffer"
                buffer_seq = sequence
            else:
                start, end = map(int, range_key.split("-"))
                # Ranges are 1-based and inclusive.
                buffer_seq = sequence[start - 1:end]
                buffer_type = f"_{start}_{end}#buffer"
            write_fasta_entry(
                f"{record_id}{buffer_type} {species_list} {buffer_stages} {description}",
                buffer_seq,
            )

    return 0


if __name__ == "__main__":
    sys.exit(main())

参考RepeatMasker/util/buildRMLibFromEMBL.pl


http://www.kler.cn/a/466017.html

相关文章:

  • 【调试记录】在CARLA中插入可以播放视频的组件
  • MySQL(六)MySQL 案例
  • 【大模型】7 天 AI 大模型学习
  • win32汇编环境,在窗口程序中画简单图形
  • ELK 使用教程采集系统日志 Elasticsearch、Logstash、Kibana
  • k8s基础(3)—Kubernetes-Deployment
  • 智能手机租赁系统全新模式改变消费习惯与商家盈利路径
  • 社区信息化管理系统(源码+文档+部署+讲解)
  • 数据结构--顺序表(详解)
  • windows文件名的最大长度
  • 批量上传文件
  • 微服务实战——购物车模块实战
  • 机场安全项目|基于改进 YOLOv8 的机场飞鸟实时目标检测方法
  • 使用java语言,自定义redistemplate
  • day26-lvm逻辑卷管理
  • 微机——绪论
  • 亚信安全2025年第1期《勒索家族和勒索事件监控报告》
  • 使用MPTCP+BBR进行数据传输,让网络又快又稳
  • Win32汇编学习笔记03.RadAsm和补丁
  • PTA数据结构作业一
  • AI新闻自动化:使用Tavily Search API构建AI新闻总结助手
  • 大循环引起CPU负载过高
  • MySQL 06 章——多表查询
  • 解决ssh和git秘钥认证失败问题
  • 管理者管理上班摸鱼
  • wujie无界微前端框架初使用