-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
scripts for analysis of studies in the BIA collection
- Loading branch information
Showing
5 changed files
with
244 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
import logging | ||
from rich import print | ||
from rich.logging import RichHandler | ||
from bia_ingest.biostudies.api import load_submission, Section | ||
from pydantic import BaseModel, ValidationError | ||
from typing import List, Optional | ||
import json | ||
import csv | ||
import time | ||
|
||
logging.basicConfig( | ||
level=logging.INFO, format="%(message)s", handlers=[RichHandler(show_time=False)] | ||
) | ||
logger = logging.getLogger() | ||
|
||
|
||
class SearchResult(BaseModel):
    """One study hit from the BioStudies search API.

    Field names mirror the JSON keys of the API response verbatim, which is
    why the naming mixes snake_case and camelCase.
    """

    accession: str  # e.g. "S-BIAD..." / "EMPIAR-..." (see prefix buckets below)
    type: str
    title: str
    author: str
    links: int
    files: int
    release_date: str
    views: int
    isPublic: bool
|
||
|
||
class SearchPage(BaseModel):
    """One page of BioStudies search results, wrapping a list of hits.

    NOTE(review): `query` and `facets` are Optional but have no default, so
    under pydantic v2 these keys are still required in the payload — confirm
    the API always includes them in its response.
    """

    page: int
    pageSize: int
    totalHits: int  # total matches across all pages, not just this one
    isTotalHitsExact: bool
    sortBy: str
    sortOrder: str
    hits: List[SearchResult]
    query: Optional[str]
    facets: Optional[str]
|
||
|
||
def count_section_type(dict_to_edit: dict, section_to_count: str) -> None:
    """Increment the tally for `section_to_count` in `dict_to_edit` in place.

    A missing key is treated as a count of zero before incrementing.
    """
    dict_to_edit[section_to_count] = dict_to_edit.get(section_to_count, 0) + 1


def recurse_subsection(section: "Section", dict_count: dict) -> None:
    """Walk a submission's section tree, tallying section types into `dict_count`.

    Counts this section's `type`, then recurses depth-first into its
    subsections. NOTE(review): list-valued entries inside `subsections` are
    skipped, as in the original code — confirm whether nested lists of
    sections should instead be flattened and counted.
    """
    count_section_type(dict_count, section.type)
    # Fixed: the original loop variable shadowed the `section` parameter,
    # and used `type(x) is list` rather than isinstance.
    for subsection in section.subsections:
        if not isinstance(subsection, list):
            recurse_subsection(subsection, dict_count)
|
||
|
||
if __name__ == "__main__":
    # Search hits previously fetched from the BioStudies API (written by the
    # companion fetch script).
    with open("all_bia_studies_in_biostudies.json", "r") as f:
        result = json.load(f)

    # Bucket studies by accession-number prefix.
    EMPIAR_studies = []
    BIAD_studies = []
    BSST_studies = []
    SJCBD_studies = []
    other_studies = []

    for study in result:
        accession_id: str = study["accession"]
        if accession_id.startswith("EMPIAR"):
            EMPIAR_studies.append(study)
        elif accession_id.startswith("S-BIAD"):
            BIAD_studies.append(study)
        elif accession_id.startswith("S-BSST"):
            BSST_studies.append(study)
        elif accession_id.startswith("S-JCBD"):
            SJCBD_studies.append(study)
        else:
            other_studies.append(study)

    print(f"EMPIAR: {len(EMPIAR_studies)}")
    print(f"BIAD: {len(BIAD_studies)}")
    print(f"BSST: {len(BSST_studies)}")
    print(f"SJCBD: {len(SJCBD_studies)}")
    print(f"other: {len(other_studies)}")

    for study in other_studies:
        print(study["accession"])

    print("-----")

    # EMPIAR and S-JCBD studies are excluded from the section analysis.
    target_studies = BIAD_studies + BSST_studies + other_studies

    count_rows = []
    bool_rows = []
    for study in target_studies:
        print(study["accession"])
        try:
            submission = load_submission(study["accession"])
        except AssertionError:
            # load_submission raises AssertionError on a non-200 response;
            # pause briefly and retry once before giving up on this study.
            time.sleep(0.25)
            try:
                submission = load_submission(study["accession"])
            except AssertionError:
                print(f"gave up on: {study['accession']} due to non 200 response")
                continue
        except ValidationError:
            print(f"gave up on: {study['accession']} due to validation error")
            continue
        # Tally how many times each section type occurs in this submission.
        count_section_types = {"accno": submission.accno}
        first_section = submission.section
        recurse_subsection(first_section, count_section_types)
        count_rows.append(count_section_types)
        # Derive a presence/absence (1/0) row from the per-type counts.
        bool_section_types = {"accno": submission.accno}
        for key, count in count_section_types.items():
            if key != "accno":
                bool_section_types[key] = 1 if count > 0 else 0
        bool_rows.append(bool_section_types)

    # Union of all column names across rows, preserving first-seen order.
    field_names = []
    for row in count_rows:
        for key in row.keys():
            if key not in field_names:
                field_names.append(key)

    # Fixed: newline="" is required when handing a file to the csv module;
    # without it the writer emits extra blank rows on Windows (csv docs).
    with open("biad_field_counts.csv", "w", newline="") as f:
        wr = csv.DictWriter(f, field_names)
        wr.writeheader()
        wr.writerows(count_rows)

    with open("biad_field_presence.csv", "w", newline="") as f:
        wr = csv.DictWriter(f, field_names)
        wr.writeheader()
        wr.writerows(bool_rows)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import pandas as pd | ||
from sklearn.cluster import KMeans | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
|
||
|
||
def colour_code_data(row: pd.Series, row_map: list):
    """Recode presence flags (1s) in *row* to the row's cluster colour.

    The colour is the cluster label for this row — looked up in *row_map*
    by the row's positional name — plus one, so 0 stays reserved for
    "section type absent".
    """
    cluster_colour = row_map[row.name] + 1
    return row.replace(1, cluster_colour)
|
||
|
||
if __name__ == "__main__":
    # Presence/absence matrix produced by the section-count script; a study
    # missing a column means that section type never appeared for it.
    section_types_count = pd.read_csv("biad_field_presence.csv").fillna(value=0)
    just_numbers = section_types_count.drop(columns="accno")
    study_accno = section_types_count["accno"].to_numpy()

    # Cluster studies by which section types they contain.
    clusters = 4
    kmeans = KMeans(n_clusters=clusters, random_state=20)
    kmeans_labels = kmeans.fit_predict(just_numbers).tolist()

    # Group accession numbers by their assigned cluster.
    clustered_studies = {key: list() for key in range(clusters)}
    for i, value in enumerate(kmeans_labels):
        clustered_studies[value].append(study_accno[i])

    # Fixed: plain loop instead of a list comprehension used only for its
    # print side effects.
    for value in clustered_studies.values():
        print(value)

    # Recolour the presence flags by cluster so the heatmap shows clusters.
    # Fixed: args=([x]) was a one-element list masquerading as a tuple.
    out = just_numbers.apply(
        colour_code_data, args=(kmeans_labels,), axis=1, result_type="broadcast"
    )

    plt.figure()
    plt.imshow(out)
    ax = plt.gca()
    ax.set_yticks(
        np.arange(len(study_accno)),
        labels=study_accno,
    )

    plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import requests | ||
from pydantic.json import pydantic_encoder | ||
from pydantic import BaseModel | ||
from typing import Optional, List | ||
import json | ||
|
||
|
||
class SearchResult(BaseModel):
    """One study hit from the BioStudies search API.

    Field names mirror the JSON keys of the API response verbatim. NOTE:
    this model is duplicated in the analysis script — consider moving it to
    a shared module.
    """

    accession: str
    type: str
    title: str
    author: str
    links: int
    files: int
    release_date: str
    views: int
    isPublic: bool
|
||
|
||
class SearchPage(BaseModel):
    """One page of BioStudies search results, wrapping a list of hits.

    NOTE(review): `query` and `facets` are Optional but have no default, so
    under pydantic v2 these keys are still required in the payload — confirm
    the API always includes them.
    """

    page: int
    pageSize: int
    totalHits: int  # total matches across all pages, not just this one
    isTotalHitsExact: bool
    sortBy: str
    sortOrder: str
    hits: List[SearchResult]
    query: Optional[str]
    facets: Optional[str]
|
||
|
||
BIOSTUDIES_COLLECTION_SEARCH = "https://www.ebi.ac.uk/biostudies/api/v1/BioImages/search?pageSize={page_size}&page={page_number}" | ||
|
||
|
||
def get_all_bia_studies(page_size: int, total_hits: int) -> list:
    """Fetch every study hit in the BioImages collection, page by page.

    Args:
        page_size: number of hits requested per API page.
        total_hits: expected total number of hits, used only to work out how
            many pages to request.

    Returns:
        A list of SearchResult models, one per study.

    Raises:
        requests.HTTPError: if any page request returns a non-2xx status.
    """
    results = []
    # Fixed: ceiling division — the original `total_hits // page_size + 1`
    # requested one extra, empty page whenever total_hits divided evenly.
    last_page = -(-total_hits // page_size)
    # NOTE(review): presumably the browser-like user agent avoids a block on
    # default client agents — confirm it is still required.
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    for page_number in range(1, last_page + 1):
        url = BIOSTUDIES_COLLECTION_SEARCH.format(
            page_size=page_size, page_number=page_number
        )
        r = requests.get(url, headers=headers)
        # Fixed: raise_for_status instead of `assert` (asserts are stripped
        # when Python runs with -O).
        r.raise_for_status()
        search_page = SearchPage.model_validate_json(r.content)
        results.extend(search_page.hits)
    return results
|
||
|
||
# Get studies from BIOSTUDIES api and store them so we don't keep calling their API
if __name__ == "__main__":
    # NOTE(review): total_hits is hard-coded to the collection size at the
    # time of writing — refresh it against the API's totalHits field before
    # re-running, or later pages will be missed as the collection grows.
    total_hits = 3048
    page_size = 100
    results = get_all_bia_studies(page_size, total_hits)

    # Cache the hits locally; pydantic_encoder serialises the SearchResult
    # models into plain JSON-compatible dicts.
    with open("all_bia_studies_in_biostudies.json", "w") as f:
        f.write(json.dumps(results, default=pydantic_encoder))