Skip to content

Commit

Permalink
forces ID based ordering of samples for all processed outputs. closes #…
Browse files Browse the repository at this point in the history
  • Loading branch information
oganm committed May 23, 2024
1 parent 614074a commit c9bde5f
Showing 1 changed file with 20 additions and 9 deletions.
29 changes: 20 additions & 9 deletions gemmapy/_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,11 +384,22 @@ def process_expression(d, dataset, api):
# more non-uniformities
df = sub.read_tsv(d)
m_cols = list(df.columns)
samples = api.raw.get_dataset_samples(dataset).data
sample_ids = sub.field_in_list(samples,"sample",'name')
sample_names = sub.field_in_list(samples,"name")
# We treat the order returned by get_dataset_samples as authoritative, which makes
# it a bit awkward when we need to access a property left out of the processed output.
# This could be simplified by ordering by IDs, but I don't want to break
# get_dataset_samples supremacy in case the ordering changes later.
# The R package makes this easier by appending raw outputs to every processed
# result. We don't do that here, so calling the API twice is necessary.
samples = api.get_dataset_samples(dataset)
samples_raw = api.raw.get_dataset_samples(dataset).data

sample_ids = [x.replace("|",".") for x in sample_ids]
raw_ids = sub.field_in_list(samples_raw,'sample','id')
sample_internal_names = sub.field_in_list(samples_raw,'sample','name')
sample_internal_names = sub.match_by(sample_internal_names, samples.sample_ID, raw_ids)

sample_names = samples.sample_name

sample_internal_names = [x.replace("|",".") for x in sample_internal_names]

def find_match(x):
match = None
Expand All @@ -397,7 +408,7 @@ def find_match(x):
match = i
return match

sample_matches = [find_match(x) for x in sample_ids]
sample_matches = [find_match(x) for x in sample_internal_names]


rename_dict = sub.make_dict([m_cols[i] for i in sample_matches],sample_names)
Expand All @@ -408,13 +419,13 @@ def find_match(x):

df = df.drop(columns=['Sequence', 'GemmaId'], errors='ignore')

non_samples = [x for x in list(df.columns) if not x in sample_names]
df = df.reindex(columns = non_samples + sample_names)
non_samples = [x for x in list(df.columns) if not x in list(sample_names)]
df = df.reindex(columns = non_samples +list(sample_names))

return df

def process_samples(d:list):

df = pd.DataFrame({
"sample_name": sub.field_in_list(d,"name"),
"sample_ID": sub.field_in_list(d,"sample",'id'),
Expand All @@ -427,7 +438,7 @@ def process_samples(d:list):
"sample_factor_values": [sub.process_FactorValueValueObject_list(x)
for x in sub.field_in_list(d,"sample","factor_value_objects")]

})
}).sort_values(by= ['sample_ID'],ignore_index=True)

return df

Expand Down

0 comments on commit c9bde5f

Please sign in to comment.