Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ignores missing entries in .Rtab files #158

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyseer/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ def read_variant(infile, p, var_type, burden, burden_regions,
if present == '1':
d[sample] = 1
elif present == ".":
d[sample] = np.nan
continue

# Use common dictionary to format design matrix etc
kstrains = sorted(set(d.keys()).intersection(all_strains)) # This will include missing
Expand Down
28 changes: 28 additions & 0 deletions tests/input_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
KMER = os.path.join(DATA_DIR, 'kmers.gz')
PRES = os.path.join(DATA_DIR, 'presence_absence_smaller.Rtab')
PRESSPACE = os.path.join(DATA_DIR, 'presence_absence_smaller_space.Rtab')
PRESMISSING = os.path.join(DATA_DIR, 'presence_absence_missing.Rtab')
VCF = os.path.join(DATA_DIR, 'variants_smaller.vcf.gz')
VCFNOGT = os.path.join(DATA_DIR, 'variants_no_gt.vcf.gz')
VCFMISSING = os.path.join(DATA_DIR, 'variants_missing.vcf.gz')
Expand Down Expand Up @@ -424,6 +425,33 @@ def test_read_variant_rtab(self):
self.assertEqual(af, 1.0)
self.assertEqual(missing, 0.0)

def test_read_variant_rtab_missing(self):
infile = open(PRESMISSING)
p = pd.read_csv(P, index_col=0, sep='\t')['binary']
header = infile.readline().rstrip()
sample_order = header.split()[1:]
# check header
self.assertEqual(sample_order, list(p.index))
# check first gene
t = read_variant(infile, p, 'Rtab', False, [], False,
p.index, sample_order)
self.assertEqual(
list(t[1]),
[0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
0, 1, 0, 0, 0, 0, 1, 0])
# read whole file
vars = []
while not t[0]:
vars.append(t)
t = read_variant(infile, p, 'Rtab', False, [], False,
p.index, sample_order)
_, _, var_names, _, _, afs, _ = zip(*vars)
# test allele frequencies
self.assertEqual(
list(afs),
[0.32, 0.52, 0.44, 0.5, 0.46, 0.48, 0.38, 0.46, 0.4, 0.44])

def test_read_variant_vcf(self):
p = pd.read_csv(P,
index_col=0,
Expand Down
11 changes: 11 additions & 0 deletions tests/presence_absence_missing.Rtab
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Gene sample_1 sample_2 sample_3 sample_4 sample_5 sample_6 sample_7 sample_8 sample_9 sample_10 sample_11 sample_12 sample_13 sample_14 sample_15 sample_16 sample_17 sample_18 sample_19 sample_20 sample_21 sample_22 sample_23 sample_24 sample_25 sample_26 sample_27 sample_28 sample_29 sample_30 sample_31 sample_32 sample_33 sample_34 sample_35 sample_36 sample_37 sample_38 sample_39 sample_40 sample_41 sample_42 sample_43 sample_44 sample_45 sample_46 sample_47 sample_48 sample_49 sample_50
gene_1 0 0 1 0 0 . 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 . 1 0 0 . . 1 0
gene_2 0 0 1 0 . 0 1 0 1 1 1 0 . 1 1 1 0 0 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 0 1 1 1 1
gene_3 0 . 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0 1 0 0 1 1 1 0 . . 1 1 0 1 1 1 . 1 . 0 1
gene_4 0 1 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 . 1 . 0 . . 1 1 1 0 0 1 1 0 1 1 1 0 0 1 0 1 1 0 1 0
gene_5 1 1 0 . 0 0 0 0 1 1 0 1 0 1 1 1 0 . 0 . 1 1 0 1 0 0 0 0 1 1 0 0 1 0 1 1 1 1 1 0 . 1 1 1 1 0 0 0 0 0
gene_6 1 1 1 1 1 1 1 0 0 1 0 0 0 . 1 0 1 0 1 0 0 . 0 1 . 1 0 . 1 0 1 0 0 1 1 1 . 0 0 0 0 1 1 1 1 0 1 0 1 0
gene_7 1 1 0 0 1 0 1 1 . 0 1 0 1 0 . 0 0 1 0 1 . 1 0 0 0 0 1 1 1 0 1 0 . 1 0 . 1 0 1 0 0 0 0 0 1 1 0 0 0 0
gene_8 0 0 0 . 0 0 1 1 0 0 1 1 0 . 1 0 1 0 1 1 1 0 0 . 0 1 1 0 0 0 1 0 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0
gene_9 0 1 1 . 1 0 1 1 0 0 . 1 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 . 1 . 0 1 1 1 0 . 0 . 1 0 1 1 . 0 0 1 0 0 1 0
gene_10 1 0 1 1 1 1 . 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 0 0 1 1 0 1 1 1 0 . 1 1 0 0 1 . . 1 0 0 0 0 1 0