-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3750899
commit c65e03e
Showing
4 changed files
with
206 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,23 @@ | ||
import pickle | ||
|
||
from gifflar.data.utils import GlycanStorage | ||
|
||
with open("gifflar/acquisition/collected.pkl", "rb") as f: | ||
_, unique_glycans, _ = pickle.load(f) | ||
|
||
gs = GlycanStorage("/scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/data") | ||
print("Loaded GlycanStorage:", len(gs.data)) | ||
|
||
data = {} | ||
for i, iupac in enumerate(unique_glycans): | ||
try: | ||
print(f"\r{i}", end="") | ||
res = gs.query(iupac) | ||
if res: | ||
data[iupac] = res["smiles"] | ||
except Exception as e: | ||
print(e) | ||
|
||
print(len(data)) | ||
with open("glycan_smiles.pkl", "wb") as f: | ||
pickle.dump(data, f) | ||
import pickle | ||
|
||
from gifflar.data.utils import GlycanStorage | ||
|
||
with open("gifflar/acquisition/collected.pkl", "rb") as f: | ||
_, unique_glycans, _ = pickle.load(f) | ||
|
||
gs = GlycanStorage("/scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/data") | ||
print("Loaded GlycanStorage:", len(gs.data)) | ||
|
||
data = {} | ||
for i, iupac in enumerate(unique_glycans): | ||
try: | ||
print(f"\r{i}", end="") | ||
res = gs.query(iupac) | ||
if res: | ||
data[iupac] = res["smiles"] | ||
except Exception as e: | ||
print(e) | ||
|
||
print(len(data)) | ||
with open("glycan_smiles.pkl", "wb") as f: | ||
pickle.dump(data, f) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
from collections import Counter | ||
import urllib.request | ||
import pickle | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
from glycowork.glycan_data import loader | ||
from glycowork.motif.graph import graph_to_string, get_possible_topologies, glycan_to_nxGraph | ||
import networkx as nx | ||
from networkx.algorithms.isomorphism import categorical_node_match | ||
|
||
|
||
print("Collecting glycan data...\n=========================") | ||
|
||
# Collect data from all sources in glycowork (assumed to be installed) and CandyCrunch | ||
if not (p := Path("glycans.pkl")).exists(): | ||
urllib.request.urlretrieve('https://github.com/BojarLab/CandyCrunch/raw/main/CandyCrunch/glycans.pkl', 'glycans.pkl') | ||
with open('glycans.pkl', 'rb') as f: | ||
candy = set(pickle.load(f)) | ||
print("Glycans in candycrunch:", len(candy)) | ||
|
||
sugarbase = set(getattr(loader, 'df_glycan')["glycan"]) | ||
print("Glycans in the sugarbase:", len(sugarbase)) | ||
|
||
# Validate that each source adds to the total number of glycans | ||
iupacs = list(sugarbase.union(candy)) | ||
print("Glycans in combined list:", len(iupacs)) | ||
|
||
expanded = [] | ||
for glycan in list(filter(lambda x: "{" in x, iupacs)): | ||
expanded += [graph_to_string(g) for g in get_possible_topologies(glycan)] | ||
iupacs = list(filter(lambda x: "{" not in x, iupacs)) + expanded | ||
print("Glycans in expanded list:", len(iupacs)) | ||
|
||
# Convert glycans to networkx graphs and hash them | ||
graphs = [glycan_to_nxGraph(iupac) for iupac in iupacs] | ||
|
||
fps = {} | ||
ids = [] | ||
for graph in graphs: | ||
h = hash(tuple(sorted(Counter(nx.get_node_attributes(graph, "string_labels").values()).items()))) | ||
if h not in fps: | ||
fps[h] = len(fps) | ||
ids.append(fps[h]) | ||
ids = np.array(ids) | ||
|
||
# Iterate over all buckets of glycans with the same hash and find isomorphisms | ||
iupacs = np.array(iupacs) | ||
graphs = np.array(graphs, dtype=object) | ||
glycans = [] | ||
isomorphisms = [] | ||
for i in range(ids.max() + 1): | ||
print(f"\r{i}", end="") | ||
mask = ids == i | ||
if sum(mask) == 1: | ||
glycans += iupacs[mask].tolist() | ||
continue | ||
graph_list = graphs[mask] | ||
iupac_list = iupacs[mask] | ||
tree_list = [nx.dfs_tree(g, max(list(g.nodes.keys()))) for g in graph_list] | ||
is_isomorphic = [False] * len(tree_list) | ||
for i in range(len(tree_list)): | ||
if is_isomorphic[i]: | ||
continue | ||
glycans.append(iupacs[i]) | ||
for j in range(i + 1, len(tree_list)): | ||
if nx.is_isomorphic(tree_list[i], tree_list[j]) and nx.is_isomorphic(graph_list[i], graph_list[j], categorical_node_match("string_labels", "")): | ||
isomorphisms.append((iupacs[i], iupacs[j])) | ||
is_isomorphic[j] = True | ||
break | ||
|
||
print("\nNumber of non-isomorphic glycans:", len(glycans)) | ||
|
||
# Visualize a pair of isomorphics glycans | ||
print("\n".join(isomorphisms[0])) | ||
|
||
# Store everything in a pickle file, first the unique iupacs, then the glycans, and finally the isomorphic pairs. The | ||
# isomorphic pairs are stored as tuples of iupacs (first the kept one, second the isomorphic, removed one). | ||
with open("glycans.pkl", "wb") as f: | ||
pickle.dump((iupacs, glycans, isomorphisms), f) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import copy | ||
import hashlib | ||
import pickle | ||
from pathlib import Path | ||
|
||
from glycowork.motif.graph import glycan_to_nxGraph, graph_to_string | ||
|
||
glycans_path = Path("glycans.pkl") | ||
if not glycans_path.exists(): | ||
import collect_glycan_data | ||
|
||
print("Collecting subglycan data...\n============================") | ||
|
||
with open(glycans_path, "rb") as f: | ||
_, iupacs, _ = pickle.load(f) | ||
|
||
|
||
def node_label_hash(label): | ||
"""Hash function for individual node labels.""" | ||
return int(hashlib.sha256(str(label).encode()).hexdigest(), 16) | ||
|
||
|
||
def wl_relabel(G, num_iterations=3): | ||
"""Weisfeiler-Lehman relabeling for graph G with node aggregation.""" | ||
# Initialize labels with node features if available, or with node degrees | ||
labels = {node: node_label_hash(G.nodes[node].get('string_labels', G.degree(node))) | ||
for node in G.nodes} | ||
|
||
for _ in range(num_iterations): | ||
new_labels = {} | ||
for node in G.nodes: | ||
# Aggregate hashes of the neighbors | ||
neighbor_hashes = [labels[neighbor] for neighbor in G.neighbors(node)] | ||
aggregated_hash = sum(neighbor_hashes) # Sum ensures permutation invariance | ||
new_labels[node] = node_label_hash(labels[node] + aggregated_hash) | ||
labels = new_labels | ||
|
||
return labels | ||
|
||
|
||
def graph_hash(G, num_iterations=3): | ||
"""Compute a permutation-invariant hash of the entire graph.""" | ||
labels = wl_relabel(G, num_iterations) | ||
# Aggregate node labels to create the final graph hash | ||
final_hash = sum(labels.values()) # Summing ensures permutation invariance | ||
return hashlib.sha256(str(final_hash).encode()).hexdigest() | ||
|
||
|
||
def cut_and_add(glycan): | ||
h = graph_hash(glycan) | ||
if h in known: | ||
return | ||
|
||
known_iupacs.append(graph_to_string(glycan)) | ||
known.add(h) | ||
# print(h, known_iupacs[-1]) | ||
|
||
# check if glycan is a single node | ||
if len(glycan.nodes()) == 1: | ||
return | ||
|
||
leafs = [x for x in glycan.nodes() if glycan.degree(x) == 1 and x != 0] | ||
for x in leafs: | ||
G = copy.deepcopy(glycan) | ||
parent = list(G.neighbors(x))[0] | ||
G.remove_node(x) | ||
G.remove_node(parent) | ||
cut_and_add(G) | ||
|
||
|
||
known_iupacs = [] | ||
known = set() | ||
for i, iupac in enumerate(iupacs): | ||
print(f"\r{i}/{len(iupacs)}\t{iupac}", end="") | ||
cut_and_add(glycan_to_nxGraph(iupac)) | ||
|
||
with open("subglycans.pkl", "wb") as f: | ||
pickle.dump(known_iupacs, f) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import pickle | ||
from pathlib import Path | ||
|
||
from glyles import convert | ||
|
||
glycans_path = Path("subglycans.pkl") | ||
if not glycans_path.exists(): | ||
import collect_pretrain_data | ||
|
||
print("Extracting SMILES data...\n=========================") | ||
|
||
with open(glycans_path, "rb") as f: | ||
iupacs = pickle.load(f) | ||
|
||
data = [] | ||
for iupac in iupacs: | ||
try: | ||
smiles = convert(iupac) | ||
if smiles is not None and len(smiles) > 10: | ||
data.append(iupac) | ||
except: | ||
pass | ||
|
||
with open("pretrain_glycans.txt", "w") as f: | ||
print(data, sep="\n", file=f) |