将 EMBL 格式转换为 FASTA 格式的脚本 embl2fa.py（脚本08）
embl2fa.py 用法（一行命令）：
python embl2fa.py X.embl > X.fa
embl2fa.py
import sys
import re
class EMBL:
    """Eager reader for an EMBL flat file.

    The file is parsed once, on construction. Entries are delimited by
    lines starting with ``//``; each entry becomes one ``EMBLRecord``
    carrying the entry's id, description and sequence.

    Attributes:
        file_name: Path of the file that was parsed.
        records: List of parsed records, in file order.
    """

    def __init__(self, file_name):
        """Parse *file_name* immediately.

        Raises:
            OSError: If the file cannot be opened (e.g. FileNotFoundError).
        """
        self.file_name = file_name
        self.records = self._parse_embl_file()

    def _parse_embl_file(self):
        """Split the file on ``//`` terminator lines and build the records.

        Returns:
            A list with one record per entry that has any non-blank line.
        """
        records = []
        record_data = []
        with open(self.file_name, 'r') as f:
            for line in f:
                if line.startswith("//"):
                    self._flush_entry(records, record_data)
                    record_data = []
                else:
                    record_data.append(line)
        # The last entry may lack its terminating "//" line.
        self._flush_entry(records, record_data)
        return records

    def _flush_entry(self, records, record_data):
        """Append a record for *record_data* unless it holds only blank lines.

        Blank lines after the final ``//`` would otherwise yield an empty
        record, and therefore an empty FASTA entry downstream.
        """
        if any(line.strip() for line in record_data):
            records.append(self._create_record(record_data))

    def _create_record(self, record_data):
        """Build one record from the raw lines of a single entry.

        Args:
            record_data: Lines of the entry, without the ``//`` terminator.

        Returns:
            An ``EMBLRecord`` with ``id``, ``description`` and ``sequence``
            filled in from the ID, DE and SQ sections respectively.
        """
        record = EMBLRecord()
        description_parts = []
        for index, line in enumerate(record_data):
            if line.startswith("ID"):
                fields = line.split()
                # Tolerate a bare "ID" line; drop the ';' that follows the
                # accession in standard EMBL ID lines ("ID   X56734; SV 1; ...").
                if len(fields) > 1:
                    record.id = fields[1].rstrip(";")
            elif line.startswith("DE"):
                # A description may span several DE lines; keep all of them.
                text = line[2:].strip()
                if text:
                    description_parts.append(text)
            elif line.startswith("SQ"):
                # Everything after the SQ header is sequence data, so there
                # is nothing left to scan for line codes.
                record.sequence = self._extract_sequence(record_data[index + 1:])
                break
        record.description = " ".join(description_parts)
        return record

    @staticmethod
    def _extract_sequence(lines):
        """Return the bare residue string held in EMBL sequence lines.

        Sequence lines group residues in blank-separated blocks and end with
        a running position counter; both whitespace and digits are removed.
        """
        return "".join(
            re.sub(r"[\s\d]+", "", line)
            for line in lines
            if not line.startswith("//")
        )

    def get_record_count(self):
        """Return the number of parsed records."""
        return len(self.records)

    def get_record(self, index):
        """Return the record at position *index* (0-based, list semantics)."""
        return self.records[index]
class EMBLRecord:
    """Container for the fields of a single EMBL entry.

    ``id``, ``description`` and ``sequence`` are plain string attributes
    that start empty and are expected to be filled in by the parser. The
    ``get_rm_*`` accessors are placeholders that return fixed values.
    """

    def __init__(self):
        """Create a record whose text fields are all empty strings."""
        self.id = self.description = self.sequence = ""

    # --- accessors for the stored fields ---------------------------------

    def get_sequence(self):
        """Return the stored sequence string."""
        return self.sequence

    def get_description(self):
        """Return the stored description string."""
        return self.description

    def get_id(self):
        """Return the stored identifier string."""
        return self.id

    # --- placeholder metadata accessors ----------------------------------

    def get_rm_type(self):
        """Return the repeat type; currently the constant ``"RMType"``."""
        # Placeholder: Add actual logic
        return "RMType"

    def get_rm_sub_type(self):
        """Return the repeat sub-type; currently always an empty string."""
        # Placeholder: Add actual logic
        return ""

    def get_rm_species_array(self):
        """Return the species names; currently always a new empty list."""
        # Placeholder: Add actual logic
        return list()

    def get_rm_search_stages_array(self):
        """Return the search stages; currently always a new empty list."""
        # Placeholder: Add actual logic
        return list()

    def get_rm_buffer_stages_array(self):
        """Return the buffer stages; currently always a new empty list."""
        # Placeholder: Add actual logic
        return list()
def _wrap_sequence(seq, width=50):
    """Return *seq* split into lines of *width* characters.

    The result always ends with exactly one newline; an empty sequence
    yields a single empty line.
    """
    chunks = [seq[i:i + width] for i in range(0, len(seq), width)]
    return "\n".join(chunks) + "\n"


def _group_buffer_stages(stages):
    """Group buffer-stage specifications by the sequence range they cover.

    A spec of the form ``N[start-end]`` is filed under the key
    ``"start-end"`` with value ``N``; a spec that merely starts with digits
    is filed under ``"full"``. Anything else is reported on stderr and
    skipped.

    Returns:
        A dict mapping range key to the list of stage strings, in first-seen
        order.
    """
    grouped = {}
    for stage in stages:
        ranged = re.match(r"(\d+)\[(\d+)\-(\d+)\]", stage)
        if ranged:
            key = f"{ranged.group(2)}-{ranged.group(3)}"
            grouped.setdefault(key, []).append(ranged.group(1))
        elif re.match(r"(\d+)", stage):
            grouped.setdefault("full", []).append(stage)
        else:
            print(f"Warning: Buffer stage {stage} not understood!", file=sys.stderr)
    return grouped


def _write_record(record, out):
    """Write *record* to *out* as FASTA, then one entry per buffer range.

    Args:
        record: Object exposing the ``get_*`` accessors of ``EMBLRecord``.
        out: Text stream with a ``write`` method.
    """
    record_id = record.get_id()
    record_type = f"#{record.get_rm_type()}"
    sub_type = record.get_rm_sub_type()
    if sub_type:
        record_type += f"/{sub_type}"
    description = record.get_description()
    species_list = " ".join(f"@{name}" for name in record.get_rm_species_array())
    stage_list = "[S:" + ",".join(record.get_rm_search_stages_array()) + "]"
    # A trailing empty stage would leave ",]" behind; normalise it.
    stage_list = re.sub(r",\]", "]", stage_list)
    sequence = record.get_sequence()

    # The wrapped sequence is already newline-terminated, so it is written
    # verbatim; print() would add a blank line after every entry.
    out.write(f">{record_id}{record_type} {species_list} {stage_list} {description}\n")
    out.write(_wrap_sequence(sequence))

    buffer_groups = _group_buffer_stages(record.get_rm_buffer_stages_array())
    for span, stage_ids in buffer_groups.items():
        stage_list_str = "[S:" + ",".join(stage_ids) + "]"
        if span == "full":
            buffer_seq = sequence
            buffer_type = "#buffer"
        else:
            start, end = map(int, span.split("-"))
            # Range is used as 1-based inclusive; clamp so a start of 0 does
            # not wrap around to the end of the sequence.
            buffer_seq = sequence[max(start - 1, 0):end]
            buffer_type = f"_{start}_{end}#buffer"
        out.write(f">{record_id}{buffer_type} {species_list} {stage_list_str} {description}\n")
        out.write(_wrap_sequence(buffer_seq))


def main(argv):
    """Convert the EMBL file named in ``argv[1]`` to FASTA on stdout.

    Returns:
        Process exit status: 0 on success, 1 on a usage error or when the
        input file does not exist.
    """
    if len(argv) < 2:
        print("Usage: python X.py X.embl > X.fa", file=sys.stderr)
        return 1
    in_file = argv[1]
    # Only opening the file can raise FileNotFoundError, so keep the try
    # narrow and let unexpected errors elsewhere surface normally.
    try:
        db = EMBL(file_name=in_file)
    except FileNotFoundError:
        print(f"Error: File {in_file} not found!", file=sys.stderr)
        return 1
    for i in range(db.get_record_count()):
        _write_record(db.get_record(i), sys.stdout)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
参考：RepeatMasker/util/buildRMLibFromEMBL.pl