-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
scripts for analysis of studies in the BIA collection
- Loading branch information
Showing
5 changed files
with
244 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
import logging | ||
from rich import print | ||
from rich.logging import RichHandler | ||
from bia_ingest.biostudies.api import load_submission, Section | ||
from pydantic import BaseModel, ValidationError | ||
from typing import List, Optional | ||
import json | ||
import csv | ||
import time | ||
|
||
logging.basicConfig( | ||
level=logging.INFO, format="%(message)s", handlers=[RichHandler(show_time=False)] | ||
) | ||
logger = logging.getLogger() | ||
|
||
|
||
class SearchResult(BaseModel):
    """One study hit from the BioStudies search API.

    Field names mirror the JSON keys of the API response verbatim, which is
    why the naming mixes snake_case and camelCase.
    """

    accession: str  # e.g. "S-BIAD..." / "EMPIAR-..." (see prefix buckets below)
    type: str
    title: str
    author: str
    links: int
    files: int
    release_date: str
    views: int
    isPublic: bool
|
||
|
||
class SearchPage(BaseModel):
    """One page of BioStudies search results, wrapping a list of hits.

    NOTE(review): `query` and `facets` are Optional but have no default, so
    under pydantic v2 these keys are still required in the payload — confirm
    the API always includes them in its response.
    """

    page: int
    pageSize: int
    totalHits: int  # total matches across all pages, not just this one
    isTotalHitsExact: bool
    sortBy: str
    sortOrder: str
    hits: List[SearchResult]
    query: Optional[str]
    facets: Optional[str]
|
||
|
||
def count_section_type(dict_to_edit: dict, section_to_count: str) -> None:
    """Increment the tally for `section_to_count` in `dict_to_edit` in place.

    A missing key is treated as a count of zero before incrementing.
    """
    dict_to_edit[section_to_count] = dict_to_edit.get(section_to_count, 0) + 1


def recurse_subsection(section: "Section", dict_count: dict) -> None:
    """Walk a submission's section tree, tallying section types into `dict_count`.

    Counts this section's `type`, then recurses depth-first into its
    subsections. NOTE(review): list-valued entries inside `subsections` are
    skipped, as in the original code — confirm whether nested lists of
    sections should instead be flattened and counted.
    """
    count_section_type(dict_count, section.type)
    # Fixed: the original loop variable shadowed the `section` parameter,
    # and used `type(x) is list` rather than isinstance.
    for subsection in section.subsections:
        if not isinstance(subsection, list):
            recurse_subsection(subsection, dict_count)
|
||
|
||
if __name__ == "__main__":
    # Search hits previously fetched from the BioStudies API (written by the
    # companion fetch script).
    with open("all_bia_studies_in_biostudies.json", "r") as f:
        result = json.load(f)

    # Bucket studies by accession-number prefix.
    EMPIAR_studies = []
    BIAD_studies = []
    BSST_studies = []
    SJCBD_studies = []
    other_studies = []

    for study in result:
        accession_id: str = study["accession"]
        if accession_id.startswith("EMPIAR"):
            EMPIAR_studies.append(study)
        elif accession_id.startswith("S-BIAD"):
            BIAD_studies.append(study)
        elif accession_id.startswith("S-BSST"):
            BSST_studies.append(study)
        elif accession_id.startswith("S-JCBD"):
            SJCBD_studies.append(study)
        else:
            other_studies.append(study)

    print(f"EMPIAR: {len(EMPIAR_studies)}")
    print(f"BIAD: {len(BIAD_studies)}")
    print(f"BSST: {len(BSST_studies)}")
    print(f"SJCBD: {len(SJCBD_studies)}")
    print(f"other: {len(other_studies)}")

    for study in other_studies:
        print(study["accession"])

    print("-----")

    # EMPIAR and S-JCBD studies are excluded from the section analysis.
    target_studies = BIAD_studies + BSST_studies + other_studies

    count_rows = []
    bool_rows = []
    for study in target_studies:
        print(study["accession"])
        try:
            submission = load_submission(study["accession"])
        except AssertionError:
            # load_submission raises AssertionError on a non-200 response;
            # pause briefly and retry once before giving up on this study.
            time.sleep(0.25)
            try:
                submission = load_submission(study["accession"])
            except AssertionError:
                print(f"gave up on: {study['accession']} due to non 200 response")
                continue
        except ValidationError:
            print(f"gave up on: {study['accession']} due to validation error")
            continue
        # Tally how many times each section type occurs in this submission.
        count_section_types = {"accno": submission.accno}
        first_section = submission.section
        recurse_subsection(first_section, count_section_types)
        count_rows.append(count_section_types)
        # Derive a presence/absence (1/0) row from the per-type counts.
        bool_section_types = {"accno": submission.accno}
        for key, count in count_section_types.items():
            if key != "accno":
                bool_section_types[key] = 1 if count > 0 else 0
        bool_rows.append(bool_section_types)

    # Union of all column names across rows, preserving first-seen order.
    field_names = []
    for row in count_rows:
        for key in row.keys():
            if key not in field_names:
                field_names.append(key)

    # Fixed: newline="" is required when handing a file to the csv module;
    # without it the writer emits extra blank rows on Windows (csv docs).
    with open("biad_field_counts.csv", "w", newline="") as f:
        wr = csv.DictWriter(f, field_names)
        wr.writeheader()
        wr.writerows(count_rows)

    with open("biad_field_presence.csv", "w", newline="") as f:
        wr = csv.DictWriter(f, field_names)
        wr.writeheader()
        wr.writerows(bool_rows)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import pandas as pd | ||
from sklearn.cluster import KMeans | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
|
||
|
||
def colour_code_data(row: pd.Series, row_map: list):
    """Recode presence flags (1s) in *row* to the row's cluster colour.

    The colour is the cluster label for this row — looked up in *row_map*
    by the row's positional name — plus one, so 0 stays reserved for
    "section type absent".
    """
    cluster_colour = row_map[row.name] + 1
    return row.replace(1, cluster_colour)
|
||
|
||
if __name__ == "__main__":
    # Presence/absence matrix produced by the section-count script; a study
    # missing a column means that section type never appeared for it.
    section_types_count = pd.read_csv("biad_field_presence.csv").fillna(value=0)
    just_numbers = section_types_count.drop(columns="accno")
    study_accno = section_types_count["accno"].to_numpy()

    # Cluster studies by which section types they contain.
    clusters = 4
    kmeans = KMeans(n_clusters=clusters, random_state=20)
    kmeans_labels = kmeans.fit_predict(just_numbers).tolist()

    # Group accession numbers by their assigned cluster.
    clustered_studies = {key: list() for key in range(clusters)}
    for i, value in enumerate(kmeans_labels):
        clustered_studies[value].append(study_accno[i])

    # Fixed: plain loop instead of a list comprehension used only for its
    # print side effects.
    for value in clustered_studies.values():
        print(value)

    # Recolour the presence flags by cluster so the heatmap shows clusters.
    # Fixed: args=([x]) was a one-element list masquerading as a tuple.
    out = just_numbers.apply(
        colour_code_data, args=(kmeans_labels,), axis=1, result_type="broadcast"
    )

    plt.figure()
    plt.imshow(out)
    ax = plt.gca()
    ax.set_yticks(
        np.arange(len(study_accno)),
        labels=study_accno,
    )

    plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import requests | ||
from pydantic.json import pydantic_encoder | ||
from pydantic import BaseModel | ||
from typing import Optional, List | ||
import json | ||
|
||
|
||
class SearchResult(BaseModel):
    """One study hit from the BioStudies search API.

    Field names mirror the JSON keys of the API response verbatim. NOTE:
    this model is duplicated in the analysis script — consider moving it to
    a shared module.
    """

    accession: str
    type: str
    title: str
    author: str
    links: int
    files: int
    release_date: str
    views: int
    isPublic: bool
|
||
|
||
class SearchPage(BaseModel):
    """One page of BioStudies search results, wrapping a list of hits.

    NOTE(review): `query` and `facets` are Optional but have no default, so
    under pydantic v2 these keys are still required in the payload — confirm
    the API always includes them.
    """

    page: int
    pageSize: int
    totalHits: int  # total matches across all pages, not just this one
    isTotalHitsExact: bool
    sortBy: str
    sortOrder: str
    hits: List[SearchResult]
    query: Optional[str]
    facets: Optional[str]
|
||
|
||
BIOSTUDIES_COLLECTION_SEARCH = "https://www.ebi.ac.uk/biostudies/api/v1/BioImages/search?pageSize={page_size}&page={page_number}" | ||
|
||
|
||
def get_all_bia_studies(page_size: int, total_hits: int) -> list:
    """Fetch every study hit in the BioImages collection, page by page.

    Args:
        page_size: number of hits requested per API page.
        total_hits: expected total number of hits, used only to work out how
            many pages to request.

    Returns:
        A list of SearchResult models, one per study.

    Raises:
        requests.HTTPError: if any page request returns a non-2xx status.
    """
    results = []
    # Fixed: ceiling division — the original `total_hits // page_size + 1`
    # requested one extra, empty page whenever total_hits divided evenly.
    last_page = -(-total_hits // page_size)
    # NOTE(review): presumably the browser-like user agent avoids a block on
    # default client agents — confirm it is still required.
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    for page_number in range(1, last_page + 1):
        url = BIOSTUDIES_COLLECTION_SEARCH.format(
            page_size=page_size, page_number=page_number
        )
        r = requests.get(url, headers=headers)
        # Fixed: raise_for_status instead of `assert` (asserts are stripped
        # when Python runs with -O).
        r.raise_for_status()
        search_page = SearchPage.model_validate_json(r.content)
        results.extend(search_page.hits)
    return results
|
||
|
||
# Get studies from BIOSTUDIES api and store them so we don't keep calling their API
if __name__ == "__main__":
    # NOTE(review): total_hits is hard-coded to the collection size at the
    # time of writing — refresh it against the API's totalHits field before
    # re-running, or later pages will be missed as the collection grows.
    total_hits = 3048
    page_size = 100
    results = get_all_bia_studies(page_size, total_hits)

    # Cache the hits locally; pydantic_encoder serialises the SearchResult
    # models into plain JSON-compatible dicts.
    with open("all_bia_studies_in_biostudies.json", "w") as f:
        f.write(json.dumps(results, default=pydantic_encoder))