Source code for phages2050.embeddings.nucleotides.word2vec
import os
import base64
from io import BytesIO
from zipfile import ZipFile
from typing import Dict
from pathlib import Path
import requests
from gensim.models.word2vec import Word2Vec
from fake_useragent import UserAgent
class Word2VecModelManager:
    """
    Download and unzip the pre-trained Word2Vec model used for
    nucleotide embeddings.

    The target directory ``model_dir`` is created on construction when it
    does not exist yet; ``download_model`` fills it with the content of
    the archive fetched from ``WORD2VEC_URL``.
    """

    # Base64-encoded location of the zipped model. ``b64decode`` returns
    # ``bytes`` and the value is deliberately left as bytes so the type of
    # this public attribute does not change.
    WORD2VEC_URL = base64.b64decode(
        "aHR0cHM6Ly9kZWVwcGV0cmkuYWkvc3RhdGljL3BoYWdlczIwNTAv"
        "d29yZDJ2ZWMtZW1iZWRkaW5nLTIxLjA3LjIwMjAuemlw"
    )
    STATUS_CODE_200 = 200
    # Seconds to wait for the server before giving up, so a stalled
    # connection cannot block the caller forever.
    REQUEST_TIMEOUT = 60

    def __init__(self, model_dir: str = "word2vec_model"):
        """
        Remember ``model_dir`` and create it when it is missing.

        :param model_dir: directory the model archive is extracted into
        """
        self.model_dir = model_dir
        if not os.path.exists(model_dir):
            # makedirs (unlike mkdir) also creates missing parent
            # directories; exist_ok tolerates another process creating
            # the directory between the check above and this call.
            os.makedirs(self.model_dir, exist_ok=True)

    @staticmethod
    def _get_headers() -> Dict[str, str]:
        """
        Return header dict with random User-Agent to support request
        and to avoid being blocked by the server
        """
        ua = UserAgent()
        # NOTE(review): presumably refreshes fake_useragent's browser
        # data - confirm the installed version still provides update().
        ua.update()
        return {"User-Agent": ua.random}

    def download_model(self) -> Path:
        """
        Download Word2Vec pre-trained model and unzip it into directory.

        This procedure should be executed once and the result
        loaded by Word2VecEmbedding class instance.

        :return: path of the model directory
        :raises AssertionError: when the server does not answer with
            HTTP 200
        """
        path = Path(self.model_dir)

        # A non-empty model directory means the model is already there.
        if os.path.exists(path) and os.listdir(path):
            print("[DEBUG] Word2Vec model exists")
            return path

        print("[DEBUG] Word2Vec model is downloading now")
        headers = self._get_headers()
        with requests.get(
            self.WORD2VEC_URL,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        ) as response:
            # Explicit check instead of ``assert``: assert statements are
            # removed under ``python -O``, which would let an error page
            # reach ZipFile. AssertionError is kept as the exception type
            # so existing callers keep working.
            if response.status_code != self.STATUS_CODE_200:
                raise AssertionError(
                    "Word2Vec model download failed with HTTP status "
                    f"{response.status_code}"
                )
            with ZipFile(BytesIO(response.content)) as zip_file:
                zip_file.extractall(self.model_dir)
        return path
class Word2VecEmbedding:
    """
    Word2Vec instance loader class
    """

    def __init__(self, model_pkl_file: str):
        """
        Load a Word2Vec model from ``model_pkl_file``.

        The pickle file needs to be serialized by the Word2Vec.save method
        before it can be loaded with this class.

        :param model_pkl_file: path to the saved Word2Vec model
        :raises FileNotFoundError: when ``model_pkl_file`` does not exist
        """
        self.model_pkl_file = model_pkl_file
        if not os.path.exists(self.model_pkl_file):
            # FileNotFoundError is a subclass of Exception, so callers
            # that catch the previously raised bare Exception still work.
            raise FileNotFoundError("Word2Vec model wasn't downloaded yet")
        # NOTE(review): Word2Vec.load unpickles the file - only load
        # model files that come from a trusted source.
        self.model = Word2Vec.load(self.model_pkl_file)
        # Dimensionality of the embedding vectors produced by the model.
        self.feature_space = self.model.vector_size

    def get_train_params(self) -> Dict:
        """
        TODO: return dict with model train parameters

        :raises NotImplementedError: always, until this is implemented
        """
        # ``NotImplemented`` is a comparison sentinel, not an exception;
        # raising it produces a misleading TypeError.
        raise NotImplementedError