# assemble_anndata.py
import json
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
# load config
# Read the JSON inside a context manager so the file handle is closed as soon
# as parsing finishes instead of lingering until garbage collection.
with open("MIRSORT_ANNOTATION_DF.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Every setting used by this script lives under the "assemble_anndata.py" key.
script_config = config["assemble_anndata.py"]
expression_path = script_config["expression_file"]
obs_path = script_config["obs_file"]
version = script_config["version"]
ad_out_path = script_config["ad_path"]
fa_out_path = script_config["fasta_path"]
def RPMNormalize(ad, use_log1p):
    """Return a reads-per-million normalised copy of ``ad``.

    The input object is copied; the copy's ``X`` is divided row-wise by
    ``obs["total_count_after_preprocessing"] / 1e6`` and the original object
    is stored on the copy's ``raw`` attribute.

    Note: inside this function the parameter ``ad`` shadows the module-level
    ``anndata`` alias of the same name.

    Args:
        ad: Object to normalise. Its ``obs`` must contain the column
            ``total_count_after_preprocessing``; ``X`` is presumably a dense
            float array (``np.divide(..., out=X)`` writes into it in place).
        use_log1p: If true, apply ``sc.pp.log1p`` with base 2 after scaling.

    Returns:
        The normalised copy, with ``obs["total_count"]`` added and ``raw``
        set to the object that was passed in.
    """
    ad_log = ad.copy()
    ad_log.obs["total_count"] = ad_log.obs["total_count_after_preprocessing"]

    # One scale factor per observation, shaped as a column so it broadcasts
    # across the variables of each row. Convert to a NumPy array first:
    # indexing a pandas Series with ``[:, None]`` is deprecated since
    # pandas 1.0 and raises on pandas 2.x.
    per_million = ad_log.obs["total_count"].to_numpy()[:, None] / 1e6
    np.divide(ad_log.X, per_million, out=ad_log.X)

    if use_log1p:
        sc.pp.log1p(ad_log, base=2)

    # Keep the un-normalised input reachable from the result.
    ad_log.raw = ad
    return ad_log
# load data and assemble as anndata
obs_df = pd.read_csv(obs_path, index_col=0, sep="\t")
# The expression table is transposed after reading, so its columns become
# the observations of the AnnData object.
new_ad = ad.read_csv(expression_path, delimiter="\t").T

# Attach the observation annotations. Without this step ``new_ad.obs`` has no
# columns and RPMNormalize fails on "total_count_after_preprocessing".
# NOTE(review): assumes the first column of the obs file holds the same
# sample names as the expression table's columns - confirm. ``.loc`` raises
# KeyError on any missing sample rather than silently misaligning rows.
obs_df.index = obs_df.index.astype(str)  # AnnData observation names are strings
new_ad.obs = obs_df.loc[new_ad.obs_names]

# RPMNormalize and log1p
new_ad = RPMNormalize(new_ad, use_log1p=True)

# add version number to uns
new_ad.uns["version"] = version

# save anndata
new_ad.write_h5ad(ad_out_path)
# save seqs as fasta
# Each variable name is written as the body of one record, under a header
# numbered by its position: ">hbdx_seq_<i>".
varnames = new_ad.var_names
# The context manager closes the file even if a write fails part-way.
with open(fa_out_path, "w") as ofile:
    ofile.writelines(
        f">hbdx_seq_{i}\n{seq}\n" for i, seq in enumerate(varnames)
    )