Source code for snplib.statistics._callrate

#!/usr/bin/env python
# coding: utf-8
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"

import pandas as pd


[docs] def call_rate( data: pd.DataFrame | str, id_col: str = None, snp_col: str = None ) -> pd.DataFrame | float | None: """ The call rate for a given SNP is defined as the proportion of individuals in the study for which the corresponding SNP information is not missing. In the following example, we filter using a call rate of 95%, meaning we retain SNPs for which there is less than 5% missing data. Of the say, 54K markers in the chip, 50K have been genotyped for a particular animal, the “call rate animal” is 50K/54K=93% Of the say, 900 animals genotyped for marker CL635944_160.1, how many have actually been successfully read? Assume that 600 have been read, then the “call rate marker” is 600/900 = 67% :param data: Pre-processed data on which the call rate is calculated. :param id_col: The name of the column with the id of the animals or markers. :param snp_col: The name of the column with the snp sequence. :return: Return dataframe with call rates for each animal if a dataframe is transmitted. The number if the snp sequence is passed as a string. None if there were errors. """ if isinstance(data, pd.DataFrame): try: if data[snp_col].dtype.hasobject: if not data[snp_col].str.isdigit().all(): return None return data[[id_col, snp_col]].\ groupby(by=id_col)[snp_col].\ apply(lambda x: 1 - ((x == "5").sum() / len(x))).\ reset_index() return data[[id_col, snp_col]]. \ groupby(by=id_col)[snp_col]. \ apply(lambda x: 1 - ((x == 5).sum() / len(x))). \ reset_index() except Exception as e: raise e elif isinstance(data, str): if not data.isdigit(): return None return round(1 - (data.count('5') / len(data)), 6) else: return None