From 91639402752bac4dc816438c0fc0ceafd0ab8113 Mon Sep 17 00:00:00 2001 From: Hong Zhang Date: Mon, 27 Feb 2023 17:24:48 +0800 Subject: [PATCH] update train with principal tx option --- psite/train.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/psite/train.py b/psite/train.py index 17b5c5b..7507c5b 100644 --- a/psite/train.py +++ b/psite/train.py @@ -54,9 +54,25 @@ def get_txrep(txinfo, type_rep='longest', path_exp=None, ignore_version=False): txinfo = txinfo.loc[txinfo['transcript_biotype'] == 'protein_coding'] if type_rep == 'longest': txrep = txinfo.sort_values(['gene_id', 'tx_len'], ascending=[True, False]) - txrep = txrep.groupby('gene_id').first().reset_index() + txrep = txrep.groupby('gene_id').first().reset_index(drop=True) elif type_rep == 'principal': - txrep = txinfo.loc[(txinfo['txtype'] == 'principal')] + try: + tx_extra = pd.read_table(path_exp) + tx_extra.rename(columns={ + 'Gene stable ID': 'gene_id', 'Transcript stable ID': 'tx_name', + 'APPRIS annotation': 'appris', 'Ensembl Canonical': 'canonical', + 'RefSeq match transcript (MANE Select)': 'mane_select'}, inplace=True) + tx_extra = tx_extra[~tx_extra.appris.isna()] + tx_extra = tx_extra.assign(appris=pd.Categorical(tx_extra.appris, categories=['principal1', + 'principal2', 'principal3', 'principal4', 'principal5', 'alternative1', 'alternative2'])) + tx_extra = tx_extra.sort_values(by='appris').reset_index(drop=True) + appris_prin = ['principal1', 'principal2', 'principal3', 'principal4', 'principal5'] + tx_extra = tx_extra[tx_extra.appris.isin(appris_prin)] + tx_extra = tx_extra[~tx_extra.duplicated(subset=['gene_id'])] + txrep = txinfo[txinfo.tx_name.isin(tx_extra.tx_name)] + except: + print('input extra txinfo file is incorrect', file=sys.stderr) + exit(1) elif type_rep == 'salmon': # salmon_output_dir/quant.sf try: tx_quant = pd.read_csv(path_exp, sep='\t') @@ -68,8 +84,8 @@ def get_txrep(txinfo, type_rep='longest', path_exp=None, ignore_version=False): tx_quant = tx_quant.groupby('gene_id').first().reset_index() txrep = txinfo.loc[(txinfo['tx_name'].isin(tx_quant['Name']))] except: - print('input salmon quant results incorrect', file=sys.stderr) - exit() + print('input salmon quant results is incorrect', file=sys.stderr) + exit(1) elif type_rep == 'kallisto': # kallisto_output_dir/abundance.tsv try: tx_quant = pd.read_csv(path_exp, sep='\t') @@ -81,11 +97,11 @@ def get_txrep(txinfo, type_rep='longest', path_exp=None, ignore_version=False): tx_quant = tx_quant.groupby('gene_id').first().reset_index() txrep = txinfo.loc[(txinfo['tx_name'].isin(tx_quant['target_id']))] except: - print('input kallisto quant results incorrect', file=sys.stderr) - exit() + print('input kallisto quant results is incorrect', file=sys.stderr) + exit(1) else: print('Incorrect txinfo_rep option!', file=sys.stderr) - exit() + exit(1) return txrep