#!/usr/bin/env python3

from collections import OrderedDict
import re 

# ==============================
# Input/output file settings (kept at the top of the script)
# ==============================
# DNA (CDS-derived) input/output
#IN_CDS = "Styela-clava-CDS_cds_from_genomic.fna"
#OUT_CDS = "900_out_Styela-clava-CDS_cds_from_genomic.fna.251103nc.txt"

# Protein (AA) input/output
#IN_AA = "Styela-clava-CDS_protein.faa"
#OUT_AA = "900_out_Styela-clava-CDS_protein.251103nc.faa"

# --- Small worked example (public sample, 5 records) ---
# The four assignments below are the active settings. To use the files named
# above instead, uncomment those assignments and comment these out.
IN_CDS = "example_cds_5.fna"
IN_AA = "example_proteins_5.faa"

# Each output name is the matching input name with an "out_" prefix.
OUT_CDS = f"out_{IN_CDS}"
OUT_AA = f"out_{IN_AA}"

def readFasta_dict(filename):
    """Read a FASTA file into an ordered mapping of header -> sequence.

    Every line is stripped of surrounding whitespace and blank lines are
    ignored. A line starting with ">" opens a new record and is used,
    unmodified (including the ">"), as the key. All following non-header
    lines are concatenated to form that record's sequence.

    If the same header line occurs more than once, the later record replaces
    the earlier sequence while keeping the position of the first occurrence.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        An ``OrderedDict`` mapping each header line to its sequence string,
        in file order. A header with no sequence lines maps to "".

    Raises:
        ValueError: If a sequence line appears before the first header.
    """
    chunks_by_header = OrderedDict()
    current_chunks = None  # sequence pieces of the record being read
    with open(filename, "r") as infile:
        for raw_line in infile:
            line = raw_line.strip()
            if not line:
                continue  # skip blank lines
            if line.startswith(">"):
                # Start a fresh record; a repeated header restarts its
                # sequence from empty.
                current_chunks = []
                chunks_by_header[line] = current_chunks
            elif current_chunks is not None:
                current_chunks.append(line)
            else:
                raise ValueError("ERROR: ヘッダーの前に配列があります。")
    # Join once per record instead of growing a string line by line.
    return OrderedDict(
        (header, "".join(chunks)) for header, chunks in chunks_by_header.items()
    )

# DNA (CDS) pass: rewrite every CDS header into the new format and remember
# the new header per protein ID in dic_name_change.
seq_dict_DNA = readFasta_dict(IN_CDS)

# Captures, in this order: the sequence ID right after ">", then the values of
# the [gene=...], [protein=...] and [protein_id=...] bracket fields. A header
# only matches when the three fields appear in that order.
_CDS_HEADER_RE = re.compile(
    r">([^ ]+).*?\[gene=([^\]]+)\].*?\[protein=([^\]]+)\].*?\[protein_id=([^\]]+)\]"
)

dic_name_change = {}
# The context manager closes the output even if a record raises mid-loop.
with open(OUT_CDS, "w") as out_DNA:
    for name, seq in seq_dict_DNA.items():
        matchAA = _CDS_HEADER_RE.search(name)
        if not matchAA:
            # Unparseable headers are reported and the record is dropped.
            print("WARNING: couldn't parse header:", name)
            continue
        # The leading sequence ID (group 1) is matched but not used.
        _, geneID, description, protID = matchAA.groups()

        # Underscores are removed from the protein ID; spaces in the
        # description become hyphens so it stays a single token.
        protID = protID.replace("_", "")
        description_dashed = description.replace(" ", "-")

        # NOTE: the trailing space is part of the emitted header and is kept
        # so the output stays byte-identical.
        newName = f">{protID} gene:{geneID} transcript:{protID} description:{description_dashed} "

        # If two CDS records share a protein ID, the later header wins.
        dic_name_change[protID] = newName

        out_DNA.write(f"{newName}\n{seq}\n")

#for protID, nameNew in dic_name_change.items():
#    print("dic_name_change", protID, "=>", nameNew)
#exit()

# Protein (AA) pass: give every protein record the header that the DNA pass
# stored in dic_name_change under the same protein ID.
seq_dict_AA = readFasta_dict(IN_AA)

# Captures the ID between ">" and the first space; a header with no space
# after the ID does not match.
_AA_HEADER_RE = re.compile(r">([^ ]+) ")

# The context manager flushes and closes the output on every exit path,
# including the abort below.
with open(OUT_AA, "w") as out_AA:
    for name, seq in seq_dict_AA.items():
        print("name", name)
        matcheBB = _AA_HEADER_RE.search(name)
        if not matcheBB:
            # Unparseable headers are reported and the record is dropped.
            print("WARNING: couldn't parse header:", name)
            continue
        # Same normalisation as the DNA pass: drop underscores from the ID.
        protID = matcheBB.group(1).replace("_", "")
        print("protID", protID)
        if protID not in dic_name_change:
            print("WARNING: couldn't find matching name for:", name)
            # A protein without a CDS counterpart aborts the run. Exit with a
            # non-zero status so callers can detect the failure; the bare
            # exit() helper reported success and depends on the site module.
            raise SystemExit(1)
        newName = dic_name_change[protID]
        out_AA.write(f"{newName}\n{seq}\n")
