#!/usr/bin/env python
# coding: utf-8
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
__all__ = ("FinalReport",)
from pathlib import Path
from functools import reduce
import re
import pandas as pd
class FinalReport(object):
""" File that contains SNP information.
:argument allele: A variant form of a single nucleotide polymorphism
(SNP), a specific polymorphic site or a whole gene detectable at
a locus. Type: 'AB', 'Forward', 'Top', 'Plus', 'Design'
:argument sep: Delimiter to use. Default value: "\\t"
Example:
[Header]
GSGT Version 2.0.4
Processing Date 10/14/2021 4:02 PM
Content BovineSNP50_v3_A1.bpm
Num SNPs 53218
Total SNPs 53218
Num Samples 3
Total Samples 3
[Data]
SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
ABCA12 1 A A 0.4048 0.8164
APAF1 1 B B 0.9067 0.9155
...
"""
__PATTERN_HEADER = re.compile(r'(^\[Header\])')
__PATTERN_DATA = re.compile(r'(^\[Data\])')
def __init__(
self,
allele: str | list | None = None,
sep: str = "\t"
) -> None:
self._delimiter = sep
self._full_data = None
self.__header = {}
self.__snp_data = None
self.__allele = allele
self._map_rn = None
@property
def header(self) -> dict:
return self.__header
@property
def snp_data(self) -> pd.DataFrame | None:
return self.__snp_data
[docs]
def handle(
self, file_rep: Path | str, conv_file: Path | str = None
) -> bool:
""" Processes the FinalReport.txt file. Highlights meta information
and data.
:param file_rep: The file FinalReport.txt or another name.
:param conv_file: The file that contains IDs of registration numbers
of animals.
:return: Returns true if file processing was successful, false if
there were errors.
"""
try:
if isinstance(file_rep, str):
file_rep = Path(file_rep)
if not file_rep.is_file() and not file_rep.exists():
return False
# Processing conversion file
if conv_file is not None:
if isinstance(conv_file, str):
conv_file = Path(conv_file)
if not conv_file.is_file() and not conv_file.exists():
return False
self.__convert_s_id(conv_file)
# Processing report file
if not self.read(file_rep):
return False
if self._full_data is None:
raise Exception("Not data in file FinalReport.txt")
self.__handler_header()
self.__handler_data()
if self._map_rn is not None:
self.__snp_data['Sample ID'] = \
self.__snp_data['Sample ID'].map(
dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
)
except Exception as e:
raise e
return True
[docs]
def read(self, file_rep: Path) -> bool:
""" Reading data from the final_report file
:param file_rep: path, pointer to the file to be read.
:return: Returns true if the read was successful, false if it failed.
"""
try:
if len(data := file_rep.read_text()) != 0:
self._full_data = data.strip().split("\n")
return True
self._full_data = None
except Exception as e:
return False
return True
def __handler_header(self) -> None:
""" Processes data from a file, selects meta-information. """
for line in self._full_data:
if self.__class__.__PATTERN_DATA.findall(line):
return
if self.__class__.__PATTERN_HEADER.findall(line):
continue
key = line.strip().split("\t")[0]
value = line.strip().split("\t")[1]
self.__header[key] = value
def __handler_data(self) -> None:
""" Processes data and forms an array for further processing. """
temp = 1
for line in self._full_data:
if self.__class__.__PATTERN_DATA.findall(line):
break
temp += 1
names_col = self.__sample_by_allele(
self._full_data[temp].split(f"{self._delimiter}")
)
if names_col is None:
raise Exception(f"Error. Allele {self.__allele} not in data.")
self.__snp_data = pd.DataFrame(
[
item_data.split(f"{self._delimiter}")
for item_data in self._full_data[temp + 1:]
],
columns=self._full_data[temp].split(f"{self._delimiter}")
)[names_col]
def __sample_by_allele(self, names: list[str]) -> list[str] | None:
""" Method that generates a list of field names choosing which alleles
to keep
:param names: List of field names in the report file.
:return: Returns a filtered list of fields by alleles.
"""
allele_templ = r'(^Allele\d\s[:-]\s{}\b)'
match self.__allele:
case None:
return names
case str():
allele_pattern = re.compile(
allele_templ.format(self.__allele)
)
case list() | tuple() | set():
allele_pattern = re.compile(
allele_templ.format("|".join(self.__allele))
)
case _:
return None
lst_allele = reduce(
lambda i, j: i + j,
[allele_pattern.findall(item) for item in names]
)
if len(lst_allele) == 0:
return None
exclude_alleles = [
item for item in names
if item.startswith("Allele") and item not in lst_allele
]
return list(filter(
lambda x: True if x not in exclude_alleles else False, names
))
def __convert_s_id(self, path_file: Path) -> None:
"""Converts sample id which is in FinalReport to animal registration
number.
:param path_file: xlsx file with animal numbers label
"""
self._map_rn = pd.read_excel(
path_file,
header=None,
names=['SID', 'UNIQ_KEY', 'SEX'],
dtype={'SID': str},
index_col=False
)
if self._map_rn.empty:
self._map_rn = None
return
self._map_rn.SID = self._map_rn.SID.str.strip()
self._map_rn.UNIQ_KEY = self._map_rn.UNIQ_KEY.str.strip()
if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
raise Exception("Error. Unique keys contain Cyrillic alphabet.")
if self._map_rn.UNIQ_KEY.isna().any():
self._map_rn.fillna('unknown', inplace=True)
[docs]
@staticmethod
def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
"""
:param seq:
:return:
"""
return any(seq.apply(lambda x: bool(re.search('[а-яА-Я]', x))))