Skip to content

Commit

Permalink
Merge pull request #81 from monarch-initiative/fix-genesets
Browse files Browse the repository at this point in the history
fixing gene sets and updating analysis
  • Loading branch information
cmungall authored May 2, 2023
2 parents 81a243d + 0da860e commit df38985
Show file tree
Hide file tree
Showing 55 changed files with 5,219 additions and 12,308 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ analysis/enrichment/%-results-$(N).yaml: tests/input/genesets/%.yaml
$(RUN) ontogpt -vv eval-enrichment -n $(N) -U $< -o $@.tmp && mv $@.tmp $@

analysis/enrichment-summary.yaml:
cat analysis/enrichment/*-$(N)yaml > $@
cat analysis/enrichment/*-$(N).yaml > $@

analysis/enrichment-summary-$(N).yaml:
cat analysis/enrichment/*-$(N).yaml > $@
Expand Down
17,091 changes: 5,100 additions & 11,991 deletions notebooks/Enrichment-Results-Analysis.ipynb

Large diffs are not rendered by default.

24 changes: 16 additions & 8 deletions src/ontogpt/engines/enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union

from jinja2 import Template
from oaklib import BasicOntologyInterface, get_adapter
Expand Down Expand Up @@ -120,9 +120,9 @@ def summarize(
raise NotImplementedError
if not gene_set.gene_ids and not gene_set.gene_symbols:
raise ValueError(f"Gene set {gene_set.name} has no gene symbols or ids")
if gene_set.gene_ids and not gene_set.gene_symbols:
adapter = list(self.label_resolvers.values())[0]
gene_set.gene_symbols = [adapter.label(x.lower()) for x in gene_set.gene_ids]
# if gene_set.gene_ids and not gene_set.gene_symbols:
# adapter = list(self.label_resolvers.values())[0]
# gene_set.gene_symbols = [adapter.label(x.lower()) for x in gene_set.gene_ids]
if not gene_set.gene_ids or normalize:
gene_set.gene_ids = list(self.map_labels(gene_set.gene_symbols, strict=strict))
logger.info(f"gene ids: {gene_set.gene_ids}")
Expand Down Expand Up @@ -152,7 +152,10 @@ def summarize(
if not prompt_template:
prompt_template = str(f"{DEFAULT_ENRICHMENT_PROMPT}.jinja2")
prompt, tf = self._prompt_from_template(
gene_tuples, template=prompt_template, annotations=annotations
gene_tuples,
template=prompt_template,
annotations=annotations,
taxon=gene_set.taxon,
)
response_text = self.client.complete(prompt, max_tokens=self.completion_length)
response_token_length = len(self.encoding.encode(response_text))
Expand All @@ -173,8 +176,9 @@ def summarize(
def _prompt_from_template(
self,
genes: List[GENE_TUPLE],
template: str,
template: Union[str, Path, Template],
truncation_factor=1.0,
taxon: str = None,
annotations=True,
) -> Tuple[str, float]:
if isinstance(template, Path):
Expand All @@ -195,22 +199,26 @@ def _prompt_from_template(
prompt = template.render(
gene_descriptions=gd_tuples,
annotations=annotations,
taxon=taxon,
SUMMARY_KEYWORD=SUMMARY_KEYWORD,
MECHANISM_KEYWORD=MECHANISM_KEYWORD,
ENRICHED_TERMS_KEYWORD=ENRICHED_TERMS_KEYWORD,
)
logging.debug(f"Prompt from template: {prompt}")
logging.info(f"Prompt [{truncation_factor}] Length: {len(prompt)}")
# https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
prompt_length = len(self.encoding.encode(prompt))
logging.info(f"Prompt [{truncation_factor}] Tokens: {prompt_length} Strlen={len(prompt)}")
prompt_length = len(self.encoding.encode(prompt)) + 10
max_len = 4097 - self.completion_length
logging.info(
f"Prompt [{truncation_factor}] Toks: {prompt_length} / {max_len} Str={len(prompt)}"
)
if prompt_length > max_len: # TODO: check this
logging.warning(f"Prompt is too long; toks: {prompt_length} len: {len(prompt)}")
return self._prompt_from_template(
genes,
template,
truncation_factor=truncation_factor * 0.8,
taxon=taxon,
annotations=annotations,
)
return prompt, truncation_factor
Expand Down
6 changes: 1 addition & 5 deletions src/ontogpt/prompts/enrichment/gene_set_summarization.jinja2
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
{% if annotations %}
I will give you a list of genes together with descriptions of their functions.
{% else %}
I will give you a list of genes.
{% endif %}
I will give you a list of {{ taxon }} genes{% if annotations %} together with descriptions of their functions{% endif %}.
Perform a term enrichment test on these genes.
i.e. tell me what the commonalities are in their function.
Make use of classification hierarchies when you do this.
Expand Down
3 changes: 3 additions & 0 deletions src/ontogpt/utils/gene_set_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ class GeneSet(BaseModel):
gene_symbols: Optional[List[str]] = None
gene_ids: Optional[List[str]] = None
taxon: str = "human"
taxon_id: Optional[str] = None
description: Optional[str] = None
source: Optional[str] = None
source_url: Optional[str] = None
target_term_ids: Optional[List[str]] = None


Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_ADIPOGENESIS.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_ADIPOGENESIS
gene_symbols: []
gene_ids:

gene_symbols:
- ABCA1
- ABCB8
- ACAA2
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_ALLOGRAFT_REJECTION.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_ALLOGRAFT_REJECTION
gene_symbols: []
gene_ids:

gene_symbols:
- AARS1
- ABCE1
- ABI1
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_ANDROGEN_RESPONSE.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_ANDROGEN_RESPONSE
gene_symbols: []
gene_ids:

gene_symbols:
- ABCC4
- ABHD2
- ACSL3
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_ANGIOGENESIS.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_ANGIOGENESIS
gene_symbols: []
gene_ids:

gene_symbols:
- APOH
- APP
- CCND2
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_APICAL_JUNCTION.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_APICAL_JUNCTION
gene_symbols: []
gene_ids:

gene_symbols:
- ACTA1
- ACTB
- ACTC1
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_APICAL_SURFACE.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_APICAL_SURFACE
gene_symbols: []
gene_ids:

gene_symbols:
- ADAM10
- ADIPOR2
- AFAP1L2
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_APOPTOSIS.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_APOPTOSIS
gene_symbols: []
gene_ids:

gene_symbols:
- ADD1
- AIFM3
- ANKH
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_BILE_ACID_METABOLISM.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_BILE_ACID_METABOLISM
gene_symbols: []
gene_ids:

gene_symbols:
- ABCA1
- ABCA2
- ABCA3
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_CHOLESTEROL_HOMEOSTASIS.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_CHOLESTEROL_HOMEOSTASIS
gene_symbols: []
gene_ids:

gene_symbols:
- ABCA2
- ACAT2
- ACSS2
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_COAGULATION.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_COAGULATION
gene_symbols: []
gene_ids:

gene_symbols:
- A2M
- ACOX2
- ADAM9
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_COMPLEMENT.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_COMPLEMENT
gene_symbols: []
gene_ids:

gene_symbols:
- ACTN2
- ADAM9
- ADRA2B
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_DNA_REPAIR.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_DNA_REPAIR
gene_symbols: []
gene_ids:

gene_symbols:
- AAAS
- ADA
- ADCY6
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_E2F_TARGETS.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_E2F_TARGETS
gene_symbols: []
gene_ids:

gene_symbols:
- AK2
- ANP32E
- ASF1A
Expand Down
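4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION.yaml
Original file line number Diff line number Diff line change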
@@ -1,6 +1,6 @@
name: HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION
gene_symbols: []
gene_ids:

gene_symbols:
- ABI3BP
- ACTA2
- ADAM12
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_ESTROGEN_RESPONSE_EARLY.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_ESTROGEN_RESPONSE_EARLY
gene_symbols: []
gene_ids:

gene_symbols:
- ABAT
- ABCA3
- ABHD2
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_ESTROGEN_RESPONSE_LATE.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_ESTROGEN_RESPONSE_LATE
gene_symbols: []
gene_ids:

gene_symbols:
- ABCA3
- ABHD2
- ACOX2
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_FATTY_ACID_METABOLISM.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_FATTY_ACID_METABOLISM
gene_symbols: []
gene_ids:

gene_symbols:
- AADAT
- ACAA1
- ACAA2
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_G2M_CHECKPOINT.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_G2M_CHECKPOINT
gene_symbols: []
gene_ids:

gene_symbols:
- ABL1
- AMD1
- ARID4A
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_GLYCOLYSIS.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_GLYCOLYSIS
gene_symbols: []
gene_ids:

gene_symbols:
- ABCB6
- ADORA2B
- AGL
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_HEDGEHOG_SIGNALING.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_HEDGEHOG_SIGNALING
gene_symbols: []
gene_ids:

gene_symbols:
- ACHE
- ADGRG1
- AMOT
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_HEME_METABOLISM.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_HEME_METABOLISM
gene_symbols: []
gene_ids:

gene_symbols:
- ABCB6
- ABCG2
- ACKR1
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_HYPOXIA.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_HYPOXIA
gene_symbols: []
gene_ids:

gene_symbols:
- ACKR3
- ADM
- ADORA2B
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_IL2_STAT5_SIGNALING.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_IL2_STAT5_SIGNALING
gene_symbols: []
gene_ids:

gene_symbols:
- ABCB1
- ADAM19
- AGER
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_IL6_JAK_STAT3_SIGNALING.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_IL6_JAK_STAT3_SIGNALING
gene_symbols: []
gene_ids:

gene_symbols:
- A2M
- ACVR1B
- ACVRL1
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_INFLAMMATORY_RESPONSE.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_INFLAMMATORY_RESPONSE
gene_symbols: []
gene_ids:

gene_symbols:
- ABCA1
- ABI1
- ACVR1B
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_INTERFERON_ALPHA_RESPONSE.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_INTERFERON_ALPHA_RESPONSE
gene_symbols: []
gene_ids:

gene_symbols:
- ADAR
- B2M
- BATF2
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_INTERFERON_GAMMA_RESPONSE.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_INTERFERON_GAMMA_RESPONSE
gene_symbols: []
gene_ids:

gene_symbols:
- ADAR
- APOL6
- ARID5B
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_KRAS_SIGNALING_DN.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_KRAS_SIGNALING_DN
gene_symbols: []
gene_ids:

gene_symbols:
- ABCB11
- ABCG4
- ACTC1
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_KRAS_SIGNALING_UP.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_KRAS_SIGNALING_UP
gene_symbols: []
gene_ids:

gene_symbols:
- ABCB1
- ACE
- ADAM17
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_MITOTIC_SPINDLE.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_MITOTIC_SPINDLE
gene_symbols: []
gene_ids:

gene_symbols:
- ABI1
- ABL1
- ABR
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_MTORC1_SIGNALING.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_MTORC1_SIGNALING
gene_symbols: []
gene_ids:

gene_symbols:
- ABCF2
- ACACA
- ACLY
Expand Down
4 changes: 2 additions & 2 deletions tests/input/genesets/HALLMARK_MYC_TARGETS_V1.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: HALLMARK_MYC_TARGETS_V1
gene_symbols: []
gene_ids:

gene_symbols:
- ABCE1
- ACP1
- AIMP2
Expand Down
Loading

0 comments on commit df38985

Please sign in to comment.