More data acquisition code
Old-Shatterhand committed Sep 2, 2024
1 parent 3750899 commit c65e03e
Showing 4 changed files with 206 additions and 23 deletions.
46 changes: 23 additions & 23 deletions gifflar/acquisition/candycrunsh.py
@@ -1,23 +1,23 @@
import pickle

from gifflar.data.utils import GlycanStorage

# Load the previously collected glycans; only the unique glycans (second entry) are needed here
with open("gifflar/acquisition/collected.pkl", "rb") as f:
    _, unique_glycans, _ = pickle.load(f)

gs = GlycanStorage("/scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/data")
print("Loaded GlycanStorage:", len(gs.data))

# Query the storage for every glycan and keep the SMILES string of each hit
data = {}
for i, iupac in enumerate(unique_glycans):
    try:
        print(f"\r{i}", end="")
        res = gs.query(iupac)
        if res:
            data[iupac] = res["smiles"]
    except Exception as e:
        print(e)

print(len(data))
with open("glycan_smiles.pkl", "wb") as f:
    pickle.dump(data, f)
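
Downstream, the dump is a plain dict from IUPAC string to SMILES string. A minimal sketch of consuming it (assuming glycan_smiles.pkl sits in the working directory):

import pickle

# Sketch: load the IUPAC -> SMILES mapping written by candycrunsh.py and peek at one entry
with open("glycan_smiles.pkl", "rb") as f:
    glycan_smiles = pickle.load(f)

print(len(glycan_smiles), "glycans with a SMILES string")
if glycan_smiles:
    iupac, smiles = next(iter(glycan_smiles.items()))
    print(iupac, "->", smiles)
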
80 changes: 80 additions & 0 deletions gifflar/acquisition/collect_glycan_data.py
@@ -0,0 +1,80 @@
from collections import Counter
import urllib.request
import pickle
from pathlib import Path

import numpy as np
from glycowork.glycan_data import loader
from glycowork.motif.graph import graph_to_string, get_possible_topologies, glycan_to_nxGraph
import networkx as nx
from networkx.algorithms.isomorphism import categorical_node_match


print("Collecting glycan data...\n=========================")

# Collect data from all sources in glycowork (assumed to be installed) and CandyCrunch
if not (p := Path("glycans.pkl")).exists():
    urllib.request.urlretrieve('https://github.com/BojarLab/CandyCrunch/raw/main/CandyCrunch/glycans.pkl', 'glycans.pkl')
with open('glycans.pkl', 'rb') as f:
    candy = set(pickle.load(f))
print("Glycans in candycrunch:", len(candy))

sugarbase = set(getattr(loader, 'df_glycan')["glycan"])
print("Glycans in the sugarbase:", len(sugarbase))

# Validate that each source adds to the total number of glycans
iupacs = list(sugarbase.union(candy))
print("Glycans in combined list:", len(iupacs))

expanded = []
for glycan in list(filter(lambda x: "{" in x, iupacs)):
    expanded += [graph_to_string(g) for g in get_possible_topologies(glycan)]
iupacs = list(filter(lambda x: "{" not in x, iupacs)) + expanded
print("Glycans in expanded list:", len(iupacs))

# Convert glycans to networkx graphs and hash them
graphs = [glycan_to_nxGraph(iupac) for iupac in iupacs]

fps = {}
ids = []
for graph in graphs:
    h = hash(tuple(sorted(Counter(nx.get_node_attributes(graph, "string_labels").values()).items())))
    if h not in fps:
        fps[h] = len(fps)
    ids.append(fps[h])
ids = np.array(ids)

# Iterate over all buckets of glycans with the same hash and find isomorphisms
iupacs = np.array(iupacs)
graphs = np.array(graphs, dtype=object)
glycans = []
isomorphisms = []
for i in range(ids.max() + 1):
    print(f"\r{i}", end="")
    mask = ids == i
    if sum(mask) == 1:
        glycans += iupacs[mask].tolist()
        continue
    graph_list = graphs[mask]
    iupac_list = iupacs[mask]
    tree_list = [nx.dfs_tree(g, max(list(g.nodes.keys()))) for g in graph_list]
    is_isomorphic = [False] * len(tree_list)
    for a in range(len(tree_list)):  # index within the bucket, not into the global iupacs array
        if is_isomorphic[a]:
            continue
        glycans.append(iupac_list[a])
        for b in range(a + 1, len(tree_list)):
            if nx.is_isomorphic(tree_list[a], tree_list[b]) and nx.is_isomorphic(graph_list[a], graph_list[b], categorical_node_match("string_labels", "")):
                isomorphisms.append((iupac_list[a], iupac_list[b]))
                is_isomorphic[b] = True
                break

print("\nNumber of non-isomorphic glycans:", len(glycans))

# Visualize a pair of isomorphic glycans
if isomorphisms:
    print("\n".join(isomorphisms[0]))

# Store everything in a pickle file: first the full expanded IUPAC list, then the non-isomorphic glycans, and finally
# the isomorphic pairs as tuples of IUPACs (first the kept one, second the isomorphic, removed one).
with open("glycans.pkl", "wb") as f:
    pickle.dump((iupacs, glycans, isomorphisms), f)
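
The label-count hash used above is only a cheap necessary condition: isomorphic graphs always share a bucket, but graphs in the same bucket may still differ, which is why the expensive pairwise isomorphism test runs only within buckets. A minimal sketch with toy, hypothetical node labels (not real glycowork graphs) illustrating the pre-filter:

from collections import Counter

import networkx as nx


def label_count_hash(graph):
    # Hash the multiset of node labels; isomorphic graphs necessarily collide
    counts = Counter(nx.get_node_attributes(graph, "string_labels").values())
    return hash(tuple(sorted(counts.items())))


g1 = nx.Graph()
g1.add_nodes_from([(0, {"string_labels": "Gal"}), (1, {"string_labels": "b1-4"}), (2, {"string_labels": "GlcNAc"})])
g1.add_edges_from([(0, 1), (1, 2)])

g2 = nx.relabel_nodes(g1, {0: 2, 2: 0})  # same labelled graph, permuted node ids
g3 = g1.copy()
g3.nodes[0]["string_labels"] = "Man"     # different monosaccharide

print(label_count_hash(g1) == label_count_hash(g2))  # True: same bucket, compared pairwise
print(label_count_hash(g1) == label_count_hash(g3))  # False: different buckets, never compared
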
78 changes: 78 additions & 0 deletions gifflar/acquisition/collect_pretrain_data.py
@@ -0,0 +1,78 @@
import copy
import hashlib
import pickle
from pathlib import Path

from glycowork.motif.graph import glycan_to_nxGraph, graph_to_string

glycans_path = Path("glycans.pkl")
if not glycans_path.exists():
    # Importing the sibling script runs it and produces glycans.pkl
    import collect_glycan_data

print("Collecting subglycan data...\n============================")

with open(glycans_path, "rb") as f:
_, iupacs, _ = pickle.load(f)


def node_label_hash(label):
    """Hash function for individual node labels."""
    return int(hashlib.sha256(str(label).encode()).hexdigest(), 16)


def wl_relabel(G, num_iterations=3):
    """Weisfeiler-Lehman relabeling for graph G with node aggregation."""
    # Initialize labels with node features if available, or with node degrees
    labels = {node: node_label_hash(G.nodes[node].get('string_labels', G.degree(node)))
              for node in G.nodes}

    for _ in range(num_iterations):
        new_labels = {}
        for node in G.nodes:
            # Aggregate hashes of the neighbors
            neighbor_hashes = [labels[neighbor] for neighbor in G.neighbors(node)]
            aggregated_hash = sum(neighbor_hashes)  # Sum ensures permutation invariance
            new_labels[node] = node_label_hash(labels[node] + aggregated_hash)
        labels = new_labels

    return labels


def graph_hash(G, num_iterations=3):
    """Compute a permutation-invariant hash of the entire graph."""
    labels = wl_relabel(G, num_iterations)
    # Aggregate node labels to create the final graph hash
    final_hash = sum(labels.values())  # Summing ensures permutation invariance
    return hashlib.sha256(str(final_hash).encode()).hexdigest()


def cut_and_add(glycan):
    """Recursively record this glycan and every subglycan obtained by pruning a leaf monosaccharide together with its linkage node."""
    h = graph_hash(glycan)
    if h in known:
        return

    known_iupacs.append(graph_to_string(glycan))
    known.add(h)

    # Stop when the glycan is a single node
    if len(glycan.nodes()) == 1:
        return

    leafs = [x for x in glycan.nodes() if glycan.degree(x) == 1 and x != 0]
    for x in leafs:
        G = copy.deepcopy(glycan)
        parent = list(G.neighbors(x))[0]
        G.remove_node(x)
        G.remove_node(parent)
        cut_and_add(G)


known_iupacs = []
known = set()
for i, iupac in enumerate(iupacs):
    print(f"\r{i}/{len(iupacs)}\t{iupac}", end="")
    cut_and_add(glycan_to_nxGraph(iupac))

with open("subglycans.pkl", "wb") as f:
pickle.dump(known_iupacs, f)
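
A quick sanity check for the hashing above is that it must be permutation-invariant: renumbering the nodes of a graph may not change its hash. The following self-contained sketch condenses node_label_hash/wl_relabel/graph_hash and runs them on a toy path graph with hypothetical labels:

import hashlib

import networkx as nx


def node_label_hash(label):
    return int(hashlib.sha256(str(label).encode()).hexdigest(), 16)


def graph_hash(G, num_iterations=3):
    # Condensed WL-style hash: sum-aggregation over neighbors keeps every step order-independent
    labels = {n: node_label_hash(G.nodes[n].get("string_labels", G.degree(n))) for n in G.nodes}
    for _ in range(num_iterations):
        labels = {n: node_label_hash(labels[n] + sum(labels[m] for m in G.neighbors(n))) for n in G.nodes}
    return hashlib.sha256(str(sum(labels.values())).encode()).hexdigest()


G = nx.path_graph(3)
nx.set_node_attributes(G, {0: "Gal", 1: "b1-4", 2: "GlcNAc"}, "string_labels")
H = nx.relabel_nodes(G, {0: 2, 2: 0})  # same labelled graph, different node ids

print(graph_hash(G) == graph_hash(H))  # True: the hash ignores node numbering
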
25 changes: 25 additions & 0 deletions gifflar/acquisition/collect_smiles_data.py
@@ -0,0 +1,25 @@
import pickle
from pathlib import Path

from glyles import convert

glycans_path = Path("subglycans.pkl")
if not glycans_path.exists():
    # Importing the sibling script runs it and produces subglycans.pkl
    import collect_pretrain_data

print("Extracting SMILES data...\n=========================")

with open(glycans_path, "rb") as f:
iupacs = pickle.load(f)

data = []
for iupac in iupacs:
    try:
        smiles = convert(iupac)
        if smiles is not None and len(smiles) > 10:
            data.append(iupac)
    except Exception:
        pass

with open("pretrain_glycans.txt", "w") as f:
print(data, sep="\n", file=f)
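
With one IUPAC string per line, the output is trivial to reuse; a minimal sketch of reading it back (assuming pretrain_glycans.txt was written as above):

# Sketch: read the pretrain glycan list back in, one IUPAC string per line
with open("pretrain_glycans.txt") as f:
    pretrain_glycans = [line.strip() for line in f if line.strip()]

print(len(pretrain_glycans), "glycans with a usable SMILES conversion")
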
