# Source code for phages2050.features.io.fasta

import os
from typing import Iterator

import pandas as pd
from Bio.SeqIO.FastaIO import FastaIterator
from Bio.SeqRecord import SeqRecord


class FastaReader:
    """
    Universal class for reading FASTA files with a genome or protein sequence,
    or multi-FASTA files containing several chunks of sequences.

    Every record in the file is normalized (upper-cased, surrounding
    whitespace removed) and the records are joined with a single space.

    Example:
        fname = 'NC_001604.fasta'
        fr = FastaReader(fname)
        kmers_sequence = fr.get_sequence()
        ks_df = fr.to_df()
    """

    def __init__(self, fasta_file_path: str):
        """
        Remember the location of the FASTA file; nothing is read yet.

        :param fasta_file_path: path to the FASTA file to read
        """
        self.fasta_file_path = fasta_file_path
        # File name without the directory part, e.g. 'NC_001604.fasta'.
        self.fasta_name = os.path.basename(self.fasta_file_path)

    @staticmethod
    def _fasta_reader(filename: str) -> Iterator["SeqRecord"]:
        """
        Lazily yield every record found in the FASTA file.

        This is a generator: the file is opened when iteration starts and
        closed once the generator is exhausted or discarded.

        :param filename: path to the FASTA file
        :return: iterator over the records produced by ``FastaIterator``
        """
        with open(filename) as handle:
            yield from FastaIterator(handle)

    @staticmethod
    def _normalize(entry: "SeqRecord") -> str:
        """
        Return the record's sequence as an upper-case string with leading and
        trailing whitespace removed.

        :param entry: record whose ``seq`` attribute holds the sequence
        """
        return str(entry.seq).upper().strip()

    def get_sequence(self) -> str:
        """
        Final genome or protein sequence string after normalization.

        All records of the file are normalized and joined with a single
        space; an input without records yields an empty string.
        """
        records = self._fasta_reader(self.fasta_file_path)
        # One join instead of repeated ``+=`` keeps this linear in the total
        # sequence length. The final strip drops the separators left at the
        # edges when the first or last record is empty.
        return " ".join(self._normalize(entry) for entry in records).strip()

    def to_df(self) -> pd.DataFrame:
        """
        Return a one-row pandas DataFrame with a single ``sequence`` column
        holding the result of :meth:`get_sequence`, the format expected by
        KMersTransformer.
        """
        return pd.DataFrame(data=[self.get_sequence()], columns=["sequence"])