# Source code for phages2050.classifiers.proteins.structural_protein

import os
import base64
import joblib
from pathlib import Path
from io import BytesIO
from zipfile import ZipFile
from typing import Dict

import pandas as pd
import requests

from fake_useragent import UserAgent


class BacteriophageStructuralProteinManager:
    """
    Download and unzip the pre-trained model and label encoder used for
    Bacteriophage Structural Protein (BSP) classification.

    The archives are extracted into ``<root_dir>/model`` and
    ``<root_dir>/label_encoder``; each archive is expected to contain
    exactly one file.
    """

    # URLs are stored base64-encoded; ``b64decode`` yields ``bytes``.
    BSP_MODEL_URL = base64.b64decode(
        "aHR0cHM6Ly9kZWVwcGV0cmkuYWkvc3RhdGljL3BoYW"
        "dlczIwNTAvYnNwX21vZGVsXzIxLjA4LjIwMjAuemlw"
    )
    BSP_LABELS_URL = base64.b64decode(
        "aHR0cHM6Ly9kZWVwcGV0cmkuYWkvc3RhdGljL3BoYWdlczIw"
        "NTAvYnNwX2xhYmVsX2VuY29kZXJfMjEuMDguMjAyMC56aXA="
    )
    STATUS_CODE_200 = 200

    def __init__(self, root_dir: str = "bsp_model"):
        """
        Remember the target directories and make sure they exist.

        :param root_dir: directory under which the ``model`` and
            ``label_encoder`` sub-directories are created
        """

        self.root_dir = root_dir
        self.model_dir = Path(self.root_dir) / "model"
        self.label_encoder_dir = Path(self.root_dir) / "label_encoder"

        # Create each sub-directory independently: a pre-existing root_dir
        # must not prevent the missing sub-directories from being created.
        os.makedirs(self.model_dir, exist_ok=True)
        os.makedirs(self.label_encoder_dir, exist_ok=True)

    @staticmethod
    def _get_path(dir_path: str) -> Path:
        """
        Return the path of the single unzipped file inside ``dir_path``.

        :param dir_path: directory an archive was extracted into
        :raises RuntimeError: if the directory does not contain exactly
            one entry
        """

        files = os.listdir(dir_path)
        if len(files) != 1:
            raise RuntimeError(
                "Expected exactly one extracted file in {}, found {}".format(
                    dir_path, len(files)
                )
            )

        return Path(dir_path) / files[0]

    @staticmethod
    def _get_headers() -> Dict:
        """
        Return header dict with random User-Agent to support request
        and to avoid being blocked by the server
        """

        ua = UserAgent()
        ua.update()

        return {"User-Agent": ua.random}

    def _download_and_extract(self, url, target_dir, headers: Dict) -> Path:
        """
        Download the zip archive at ``url`` and extract it into ``target_dir``.

        :param url: archive location (``bytes`` or ``str``)
        :param target_dir: directory the archive is extracted into
        :param headers: HTTP headers sent with the request
        :return: path of the single extracted file
        :raises RuntimeError: if the server does not answer with HTTP 200 or
            the directory does not end up with exactly one entry
        """

        with requests.get(url, headers=headers, timeout=10) as response:
            # Explicit check instead of ``assert``: asserts are stripped
            # when Python runs with -O, which would let error pages through.
            if response.status_code != self.STATUS_CODE_200:
                raise RuntimeError(
                    "Download of {!r} failed with HTTP status {}".format(
                        url, response.status_code
                    )
                )

            with ZipFile(BytesIO(response.content)) as zip_file:
                zip_file.extractall(target_dir)

        return self._get_path(target_dir)

    def download_model(self) -> Dict:
        """
        Download pre-trained model and label encoder and unzip them into directories

        This procedure should be executed once and the result
        loaded by BacteriophageStructuralProteinClassifier class instance

        :return: dict with the keys ``"model_path"`` and
            ``"label_encoder_path"`` pointing at the extracted files
        :raises RuntimeError: if a download does not return HTTP 200 or an
            archive does not unpack to exactly one file
        """

        headers = self._get_headers()

        # Download classifier model
        model_path = self._download_and_extract(
            self.BSP_MODEL_URL, self.model_dir, headers
        )

        # Download labels encoder
        label_encoder_path = self._download_and_extract(
            self.BSP_LABELS_URL, self.label_encoder_dir, headers
        )

        return {
            "model_path": model_path,
            "label_encoder_path": label_encoder_path,
        }


class BacteriophageStructuralProteinClassifier:
    """
    Load and execute the pre-trained model and label encoder for phage
    structural protein prediction. The model supports 11 protein classes:
    - HTJ
    - basplate
    - collar
    - major_capsid
    - major_tail
    - minor_capsid
    - minor_tail
    - other
    - portal
    - tail_fiber
    - tail_shaft

    The model accuracy is 96.92% on training and 95.64% on validation sets after
    10-fold cross-validation. Model was trained with 11 000 samples.
    """

    SUPPORTED_COLUMNS = ["predicted_index", "predicted_class", "accuracy"]
    # Number of features (columns) expected in the input vector.
    FEATURE_SPACE = 1024

    def __init__(self, model_path: str, label_encoder_path: str):
        """
        Check that the model and label encoder files exist and,
        if so, load them into memory.

        This method should be executed once

        :param model_path: path to the serialized classifier
        :param label_encoder_path: path to the serialized label encoder
        :raises FileNotFoundError: if either path does not exist
        """

        self.model_dir = model_path
        if not os.path.exists(self.model_dir):
            raise FileNotFoundError("BSP model wasn't downloaded yet")

        self.le_dir = label_encoder_path
        if not os.path.exists(self.le_dir):
            raise FileNotFoundError("BSP labels wasn't downloaded yet")

        self._load_classifier()

    def _load_classifier(self) -> None:
        """
        Load Machine Learning pre-trained model with label encoder
        """

        # SECURITY NOTE: joblib.load unpickles its input and can execute
        # arbitrary code; only load files obtained from a trusted source.
        self.classifier = joblib.load(self.model_dir)
        self.le = joblib.load(self.le_dir)

    def predict(self, protein_vector: pd.DataFrame) -> pd.DataFrame:
        """
        Execute classification model and return best prediction
        as DataFrame with three columns:
        - "predicted_index" - predicted protein class index
        - "predicted_class" - predicted protein class name
        - "accuracy" - probability of the predicted class (0-100%)

        This method can be executed many times for different
        protein vectors

        protein_vector is represented by a two-dimensional DataFrame with
        1024 numeric columns as a result of BERT embedding. Only the first
        row is classified.

        :raises ValueError: if protein_vector is not two-dimensional with
            FEATURE_SPACE columns
        """

        # ``shape`` is a (rows, columns) tuple, so compare the column count
        # rather than the tuple itself against FEATURE_SPACE.
        shape = getattr(protein_vector, "shape", None)
        if shape is None or len(shape) != 2 or shape[1] != self.FEATURE_SPACE:
            raise ValueError(
                "protein_vector must have shape (n, {}), got {}".format(
                    self.FEATURE_SPACE, shape
                )
            )

        predicted_index = self.classifier.predict(protein_vector)[0]
        predicted_class = self.le.inverse_transform([predicted_index])[0]

        # NOTE(review): indexing the probability row by the predicted label
        # assumes the classifier's classes are 0..n-1 in order - confirm.
        predict_proba = self.classifier.predict_proba(protein_vector)[0][
            predicted_index
        ]
        accuracy = round(predict_proba * 100.0, 2)

        prediction_data = [predicted_index, predicted_class, accuracy]

        return pd.DataFrame(data=[prediction_data], columns=self.SUPPORTED_COLUMNS)