# Source code for phages2050.features.transformers.kmers

from typing import Set, Union, List

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from pandas.core.series import Series

from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText

from pandarallel import pandarallel


# Parallelization has a cost, so it only pays off when the amount of
# computation to distribute is large enough.
# For very small amounts of data, parallelization is not always worth it.
# NOTE: this is a module-level call, so pandarallel is initialized as a side
# effect of importing this module. It is presumably what makes the
# `parallel_apply` accessor used further below available -- confirm against
# the pandarallel documentation.
pandarallel.initialize()


class KMersTransformer(BaseEstimator, TransformerMixin):
    """
    Extract overlapping k-mers ("words") from biological sequences.

    A k-mer is a subsequence of fixed length (6 by default) contained within
    a biological sequence. Only k-mers composed exclusively of the
    nucleotides A, C, T and G are kept.

    Example:
        fname = 'NC_001604.fasta'
        fr = FastaReader(fname)
        sample = fr.to_df()

        kmt = KMersTransformer()
        kmt.transform(sample)
    """

    def __init__(self, size: int = 6):
        """
        Args:
            size: length of every extracted k-mer (window width).
        """
        # Characters a k-mer may consist of; any other character
        # (including lower-case nucleotides) disqualifies the k-mer.
        self.accepted_chars: Set[str] = {"A", "C", "T", "G"}
        self.size: int = size

    def fit(self, X=None, y=None):
        """
        Do nothing and return the transformer itself.

        The transformer is stateless; this method only exists so that
        ``fit_transform`` (inherited from ``TransformerMixin``) and
        scikit-learn pipelines, which both call ``fit``, can be used.
        """
        return self

    def _extract_kmers_from_sequence(self, sequence: str) -> str:
        """
        Extract k-mers from a single sequence with a sliding window.

        The window of width ``self.size`` is moved over the whole sequence
        one character at a time. A k-mer containing any unsupported
        character is ignored as a whole (not included in the result).

        Args:
            sequence: the raw sequence to scan.

        Returns:
            The accepted k-mers joined by single spaces, which is the
            input format expected by the embedding step. The string is
            empty when the sequence is shorter than ``self.size`` or no
            window is accepted.

        Raises:
            ValueError: if ``self.size`` is smaller than 1.
        """
        size = self.size
        if size < 1:
            # A non-positive width would silently produce empty or
            # mis-sized "k-mers" instead of failing.
            raise ValueError(
                f"k-mer size must be a positive integer, got {size!r}"
            )

        accepted = self.accepted_chars
        # Slice each window once and test it against the accepted alphabet.
        windows = (
            sequence[start : start + size]
            for start in range(len(sequence) - size + 1)
        )
        return " ".join([kmer for kmer in windows if accepted.issuperset(kmer)])

    def transform(self, df: pd.DataFrame) -> Series:
        """
        Run the k-mer extraction on every sequence of the data frame.

        Args:
            df: data frame whose only column is named ``sequence``.

        Returns:
            A Series holding, for every row, the space-separated k-mers
            string of that row's sequence.

        Raises:
            AssertionError: if the columns of ``df`` are not exactly
                ``["sequence"]``.
        """
        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`; the exception type is kept unchanged
        # for backward compatibility.
        if list(df.columns) != ["sequence"]:
            raise AssertionError(
                "Expected a DataFrame with a single 'sequence' column, "
                f"got columns: {list(df.columns)!r}"
            )

        # `parallel_apply` is the pandarallel counterpart of `Series.apply`.
        return df.sequence.parallel_apply(self._extract_kmers_from_sequence)
class GenomeAvgTransformer(TransformerMixin, BaseEstimator):
    """
    Average k-mer embeddings to represent a bacteriophage as one vector.

    Most Word2vec or fastText pre-trained models provide numerical
    representations of individual words but not of entire documents.
    This class averages the vectors of all k-mers of a DNA sequence, so the
    generated bacteriophage vector is the centroid of its k-mers in the
    embedding feature space.
    """

    def __init__(self, gensim_model: Union[FastText, Word2Vec]):
        """
        Args:
            gensim_model: trained embedding model; Word2Vec as well as
                fastText models are supported.
        """
        self.gensim_model: Union[FastText, Word2Vec] = gensim_model
        # One output column per embedding dimension.
        self.columns = [
            f"feature_{index}" for index in range(self.gensim_model.vector_size)
        ]

    def fit(self, X=None, y=None):
        """
        Do nothing and return the transformer itself.

        The embedding model is supplied already trained; this method only
        exists so that ``fit_transform`` (inherited from
        ``TransformerMixin``) and scikit-learn pipelines, which both call
        ``fit``, can be used.
        """
        return self

    def average_word_vectors(self, words: List[str], vocabulary: Set) -> np.ndarray:
        """
        Return a fixed-length numeric vector for one DNA sequence.

        Args:
            words: the k-mers of the sequence.
            vocabulary: the words for which vectors are looked up; k-mers
                outside of it are ignored.

        Returns:
            A float64 vector of length ``gensim_model.vector_size``: the
            mean of the vectors of all supported k-mers, or all zeros when
            none of the k-mers is in the vocabulary.
        """
        # Keep only the words supported by the vocabulary
        supported_words: list = [word for word in words if word in vocabulary]

        if not supported_words:
            return np.zeros((self.gensim_model.vector_size,), dtype="float64")

        # Vectors are looked up through `.wv`: indexing the model object
        # itself was deprecated in gensim 3.x and removed in gensim 4.
        return np.mean(
            self.gensim_model.wv[supported_words], axis=0, dtype="float64"
        )

    def averaged_word_vectorizer(self, column_with_kmers_seqs) -> np.ndarray:
        """
        Compute the averaged vector of every k-mers string.

        Args:
            column_with_kmers_seqs: iterable of strings, each holding the
                space-separated k-mers of one sequence.

        Returns:
            An array with one row per input string and
            ``gensim_model.vector_size`` columns; the shape is
            ``(0, vector_size)`` for empty input.
        """
        keyed_vectors = self.gensim_model.wv
        # gensim 4 renamed `index2word` to `index_to_key`; support both.
        known_words = getattr(keyed_vectors, "index_to_key", None)
        if known_words is None:
            known_words = keyed_vectors.index2word
        # Unique set of words for constant-time membership tests
        vocabulary: set = set(known_words)

        features: list = [
            self.average_word_vectors(
                # Split the space-separated k-mers string into words
                words=sentence.split(),
                vocabulary=vocabulary,
            )
            for sentence in column_with_kmers_seqs
        ]

        if not features:
            # Keep the result two-dimensional so that a DataFrame with the
            # expected feature columns can still be built from it.
            return np.empty((0, self.gensim_model.vector_size), dtype="float64")
        return np.array(features)

    def transform(self, column_with_kmers_seqs: Series) -> pd.DataFrame:
        """
        Turn k-mers strings into a data frame of averaged embeddings.

        Args:
            column_with_kmers_seqs: Series of space-separated k-mers
                strings, one per sequence.

        Returns:
            A DataFrame with one row per input string and one
            ``feature_<i>`` column per embedding dimension. The frame gets
            a fresh default index; the index of the input Series is not
            carried over.
        """
        return pd.DataFrame(
            data=self.averaged_word_vectorizer(column_with_kmers_seqs),
            columns=self.columns,
        )