scripts for analysis of studies in the BIA collection
sherwoodf committed Dec 12, 2024
1 parent 089ea7b commit ec7cffa
Showing 5 changed files with 244 additions and 0 deletions.
3 changes: 3 additions & 0 deletions bia-ingest/pyproject.toml
@@ -18,6 +18,9 @@
typer = "^0.12.3"
typing-extensions = "^4.12.2"
pydantic-settings = "^2.3.4"
rich-tools = "^0.5.1"
scipy = "^1.14.1"
scikit-learn = "^1.5.2"
matplotlib = "^3.9.2"

[tool.poetry.scripts]
biaingest = "bia_ingest.cli:app"
Empty file added bia-ingest/scripts/__init__.py
138 changes: 138 additions & 0 deletions bia-ingest/scripts/biostudies_fields_stats.py
@@ -0,0 +1,138 @@
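# Analyse which section types appear across BIA studies in BioStudies.
#
# Expects all_bia_studies_in_biostudies.json (produced by
# scripts/get_all_bia_studies.py) and writes biad_field_counts.csv and
# biad_field_presence.csv, which scripts/clustering.py consumes.
# SearchResult and SearchPage below duplicate the models in
# get_all_bia_studies.py.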
import logging
from rich import print
from rich.logging import RichHandler
from bia_ingest.biostudies.api import load_submission, Section
from pydantic import BaseModel, ValidationError
from typing import List, Optional
import json
import csv
import time

logging.basicConfig(
level=logging.INFO, format="%(message)s", handlers=[RichHandler(show_time=False)]
)
logger = logging.getLogger()


class SearchResult(BaseModel):
accession: str
type: str
title: str
author: str
links: int
files: int
release_date: str
views: int
isPublic: bool


class SearchPage(BaseModel):
page: int
pageSize: int
totalHits: int
isTotalHitsExact: bool
sortBy: str
sortOrder: str
hits: List[SearchResult]
query: Optional[str]
facets: Optional[str]


def count_section_type(dict_to_edit: dict, section_to_count: str):
    """Increment the running count for a section type, starting it at zero."""
    if section_to_count not in dict_to_edit:
        dict_to_edit[section_to_count] = 0
    dict_to_edit[section_to_count] += 1


def recurse_subsection(section: Section, dict_count: dict):
    """Walk the section tree depth-first, counting every section type seen."""
    count_section_type(dict_count, section.type)
    for subsection in section.subsections:
        # Subsections occasionally hold nested lists of sections; skip those
        # rather than recursing into a shape we don't expect.
        if isinstance(subsection, list):
            continue
        recurse_subsection(subsection, dict_count)


if __name__ == "__main__":
with open("all_bia_studies_in_biostudies.json", "r") as f:
result = json.load(f)

    # Bucket studies by accession prefix (EMPIAR, S-BIAD, S-BSST, S-JCBD).
    EMPIAR_studies = []
BIAD_studies = []
BSST_studies = []
SJCBD_studies = []
other_studies = []

for study in result:
accession_id: str = study["accession"]
if accession_id.startswith("EMPIAR"):
EMPIAR_studies.append(study)
elif accession_id.startswith("S-BIAD"):
BIAD_studies.append(study)
elif accession_id.startswith("S-BSST"):
BSST_studies.append(study)
elif accession_id.startswith("S-JCBD"):
SJCBD_studies.append(study)
else:
other_studies.append(study)

print(f"EMPIAR: {len(EMPIAR_studies)}")
print(f"BIAD: {len(BIAD_studies)}")
print(f"BSST: {len(BSST_studies)}")
print(f"SJCBD: {len(SJCBD_studies)}")
print(f"other: {len(other_studies)}")

for study in other_studies:
print(study["accession"])

print("-----")

    # EMPIAR and S-JCBD studies are excluded from the field analysis below.
    target_studies = BIAD_studies + BSST_studies + other_studies

count_rows = []
bool_rows = []
for study in target_studies:
print(study["accession"])
        try:
            submission = load_submission(study["accession"])
        except AssertionError:
            # Non-200 response: back off briefly and retry once.
            time.sleep(0.25)
            try:
                submission = load_submission(study["accession"])
            except AssertionError:
                print(f"gave up on: {study['accession']} due to non 200 response")
                continue
            except ValidationError:
                print(f"gave up on: {study['accession']} due to validation error")
                continue
        except ValidationError:
            print(f"gave up on: {study['accession']} due to validation error")
            continue
        # Count every section type in this study's section tree.
        count_section_types = {"accno": submission.accno}
        recurse_subsection(submission.section, count_section_types)
        count_rows.append(count_section_types)
        # Collapse counts into 0/1 presence flags for the clustering script.
        bool_section_types = {"accno": submission.accno}
        for key, count in count_section_types.items():
            if key != "accno":
                bool_section_types[key] = 1 if count > 0 else 0
        bool_rows.append(bool_section_types)

    # Build the union of all keys seen, preserving first-seen order, so every
    # row fits a single CSV header.
    field_names = []
    for row in count_rows:
        for key in row.keys():
            if key not in field_names:
                field_names.append(key)

    with open("biad_field_counts.csv", "w", newline="") as f:
        wr = csv.DictWriter(f, field_names)
        wr.writeheader()
        wr.writerows(count_rows)

    with open("biad_field_presence.csv", "w", newline="") as f:
        wr = csv.DictWriter(f, field_names)
        wr.writeheader()
        wr.writerows(bool_rows)
39 changes: 39 additions & 0 deletions bia-ingest/scripts/clustering.py
@@ -0,0 +1,39 @@
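# Cluster BIA studies by which section types they contain.
#
# Reads biad_field_presence.csv (written by biostudies_fields_stats.py),
# runs k-means on the 0/1 presence matrix, and shows a heatmap of the matrix
# with rows labelled by study accession.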
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np


def colour_code_data(row: pd.Series, row_map: list):
    """Replace a row's 1s with its cluster label + 1, so cluster 0 stays
    distinct from absent fields (which remain 0)."""
    return row.replace(1, row_map[row.name] + 1)


if __name__ == "__main__":
    section_types_count = pd.read_csv("biad_field_presence.csv").fillna(value=0)
    just_numbers = section_types_count.drop(columns="accno")
    study_accno = section_types_count["accno"].to_numpy()

    # Number of clusters is hard-coded; the fixed random_state keeps runs
    # repeatable.
    clusters = 4
    kmeans = KMeans(n_clusters=clusters, random_state=20)
    kmeans_labels = kmeans.fit_predict(just_numbers).tolist()

    # Group study accessions by their assigned cluster and print each group.
    clustered_studies = {key: list() for key in range(clusters)}
    for i, value in enumerate(kmeans_labels):
        clustered_studies[value].append(study_accno[i])

    for value in clustered_studies.values():
        print(value)

    # Recolour each study's presence flags by cluster label for the heatmap.
    out = just_numbers.apply(
        colour_code_data, args=(kmeans_labels,), axis=1, result_type="broadcast"
    )

    # Heatmap of the presence matrix: one row per study, with present fields
    # coloured by cluster label.
    plt.figure()
    plt.imshow(out)
    ax = plt.gca()
    ax.set_yticks(
        np.arange(len(study_accno)),
        labels=study_accno,
    )

plt.show()
64 changes: 64 additions & 0 deletions bia-ingest/scripts/get_all_bia_studies.py
@@ -0,0 +1,64 @@
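# Step 1 of the analysis pipeline: fetch every study in the BioImages
# collection from the BioStudies search API and cache the hits locally in
# all_bia_studies_in_biostudies.json.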
import requests
from pydantic.json import pydantic_encoder
from pydantic import BaseModel
from typing import Optional, List
import json


class SearchResult(BaseModel):
accession: str
type: str
title: str
author: str
links: int
files: int
release_date: str
views: int
isPublic: bool


class SearchPage(BaseModel):
page: int
pageSize: int
totalHits: int
isTotalHitsExact: bool
sortBy: str
sortOrder: str
hits: List[SearchResult]
query: Optional[str]
facets: Optional[str]


BIOSTUDIES_COLLECTION_SEARCH = "https://www.ebi.ac.uk/biostudies/api/v1/BioImages/search?pageSize={page_size}&page={page_number}"


def get_all_bia_studies(page_size: int, total_hits: int) -> List[SearchResult]:
    """Page through the BioImages collection search and collect every hit."""
    results = []
    # Ceiling division so the final, partially filled page is still fetched.
    last_page = -(-total_hits // page_size)
    page_count = 1
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    while page_count <= last_page:
        url = BIOSTUDIES_COLLECTION_SEARCH.format(
            page_size=page_size, page_number=page_count
        )
        r = requests.get(url, headers=headers)
        assert r.status_code == 200
        search_page = SearchPage.model_validate_json(r.content)
        results.extend(search_page.hits)
        page_count += 1
    return results


# Fetch the studies from the BioStudies API once and store them locally so we
# don't keep calling the API on every analysis run.
if __name__ == "__main__":
    # Total hits at the time of writing; the live value is reported in the
    # totalHits field of the first search page.
    total_hits = 3048
    page_size = 100
    results = get_all_bia_studies(page_size, total_hits)

    with open("all_bia_studies_in_biostudies.json", "w") as f:
        f.write(json.dumps(results, default=pydantic_encoder))
