Skip to content

Commit

Permalink
added comments
Browse files Browse the repository at this point in the history
  • Loading branch information
GiuseppeBocci authored Aug 30, 2023
1 parent a301cfa commit 14762a9
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 55 deletions.
92 changes: 48 additions & 44 deletions scripts/AssociationMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

warnings.filterwarnings('ignore')
import sys
# import numpy as np
import numpy as np
from sklearn.cluster import KMeans
import sklearn.metrics as metrics
from scipy import stats as stats
Expand All @@ -13,12 +13,7 @@
from contextlib import contextmanager
import os
from utils import EvaluationMetric
from scripts.processAssociationMatrix import *

import math


# poolAM = multiprocessing.Pool(5)

@contextmanager
def suppress_stdout():
Expand Down Expand Up @@ -137,61 +132,68 @@ def initialize(self, initialize_strategy, verbose):
self.G_left = KMeans(n_clusters=self.k1, algorithm='full').fit_transform(self.association_matrix)
self.G_left_primary = True
if self.G_right is None:
with suppress_stdout(): # TODO: full is lloyd nelle versioni successive
with suppress_stdout(): # full is lloyd
self.G_right = KMeans(n_clusters=self.k2, algorithm='full').fit_transform(
self.association_matrix.transpose())
self.G_right_primary = True
self.S = np.linalg.multi_dot([self.G_left.transpose(), self.association_matrix, self.G_right])
# TODO: added svd initialization
elif initialize_strategy == "svd":
# full_matrices is false because we want an approximation so la.svd(...) will compute only
# leading eigenvalues TODO: è ok?
# TODO: sarebbe da calcolare il tempo di
# leading eigenvalues
u, s, vh = la.svd(self.association_matrix, full_matrices=False)
# TODO
k_svd = min(self.k1, len(s))
# print("lenSVd: "+ str(len(s)))
# print("K1: " + str(self.k1))
# if G_left was already initialized the actual shape must be taken in account
if self.G_left is not None:
k_svd = min(k_svd, self.G_left.shape[1])
self.k1 = k_svd
self.k2 = k_svd
self.k2 = min(self.k2, vh.shape[0])
if verbose:
print("Association matrix filename: " + self.safe_filename)
print("Used parameters: k1 = " + str(self.k1) + " and k2 = " + str(
self.k2))
print("Non-zero elements of the association matrix = {}".format(
np.count_nonzero(self.association_matrix)))
if self.G_left is None:
with suppress_stdout():
# reduction of G_left
self.G_left = u[:self.association_matrix.shape[0], :self.k1]
# all negative values set to 0
for i in range(len(self.G_left)):
for j in range(self.k1):
if math.isnan(self.G_left[i][j]):
input("Nan in G_left " + self.filename + " >")
if self.G_left[i][j] < 0:
self.G_left[i][j] = 0
self.G_left_primary = True # TODO: a cosa serve questa variabile?
# reduction of G_left
self.G_left = u[:self.association_matrix.shape[0], :self.k1]
# all negative values set to 0
for i in range(len(self.G_left)):
for j in range(len(self.G_left[0])):
if self.G_left[i][j] < 0:
self.G_left[i][j] = 0
self.G_left_primary = True

if self.G_right is None:
with suppress_stdout():
# reduction of G_right. Need to transpose to stay consistent with other initialization methods
self.G_right = vh[:self.k2, :self.association_matrix.shape[1]].transpose()
# all negative values set to 0
for i in range(len(self.G_right)):
for j in range(self.k2):
if math.isnan(self.G_right[i][j]):
input("Nan in G_right " + self.filename + " >")
if self.G_right[i][j] < 0:
self.G_right[i][j] = 0
self.G_right_primary = True
# reduction of G_right. Need to transpose to stay consistent with other initialization methods
self.G_right = vh[:self.k2, :self.association_matrix.shape[1]].transpose()
# all negative values set to 0
for i in range(len(self.G_right)):
for j in range(len(self.G_right[0])):
if self.G_right[i][j] < 0:
self.G_right[i][j] = 0
self.G_right_primary = True
# s became an array with all eigenvalues in it
s = s[:self.k1]
# Need to discard all negative eigenvalues
for i in range(self.k1):
if math.isnan(s[i]):
input("Nan in S " + self.filename + " >")
if s[i] < 0:
s[i] = 0
len_s = len(s)
self.S = np.diag(s) # S became a diagonal matrix
if self.k2 - len_s > 0:
zeros = np.zeros((len_s, self.k2 - len_s)) # Column to add
self.S = np.hstack([self.S, zeros])
elif self.k2 - len_s < 0:
diffK = len_s - self.k2
self.S = self.S[:, :-diffK]
if self.G_left.shape[1] - len_s > 0:
zeros = np.zeros((self.G_left.shape[1] - len(self.S[0]), len(self.S[0]))) # Row to add
self.S = np.vstack([self.S, zeros])

print(np.shape(self.G_left),np.shape(self.S),np.shape(self.G_right))
# print(self.S)

for am in self.dep_own_left_other_left:
if am.G_left is None:
Expand All @@ -205,7 +207,7 @@ def initialize(self, initialize_strategy, verbose):
for am in self.dep_own_right_other_right:
if am.G_right is None:
am.G_right = self.G_right
if verbose:
if verbose or True:
print(self.leftds, self.rightds, self.association_matrix.shape)
print("Shape Factor Matrix left " + str(self.G_left.shape))
print("Shape Factor Matrix right " + str(self.G_right.shape) + "\n")
Expand Down Expand Up @@ -244,7 +246,6 @@ def validate(self, metric=EvaluationMetric.APS):

R12_2 = list(self.original_matrix[self.M == 0])
R12_found_2 = list(self.rebuilt_association_matrix[self.M == 0])
# TODO: R12_found_2 è tutti nan in svd generando un errore (errore vecchio)
if metric == EvaluationMetric.AUROC:
fpr, tpr, _ = metrics.roc_curve(R12_2, R12_found_2)
return metrics.auc(fpr, tpr)
Expand All @@ -259,7 +260,7 @@ def validate(self, metric=EvaluationMetric.APS):

def get_error(self):
self.rebuilt_association_matrix = np.linalg.multi_dot([self.G_left, self.S, self.G_right.transpose()])
# print(self.rebuilt_association_matrix) # TODO: rebuilt_association_matrix a volte è nan
# print(self.rebuilt_association_matrix)
return np.linalg.norm(self.association_matrix - self.rebuilt_association_matrix, ord='fro') ** 2

def update_G_right(self):
Expand All @@ -274,7 +275,8 @@ def update_G_right(self):
num += np.linalg.multi_dot([am.association_matrix, am.G_right, am.S.transpose()])
den += np.linalg.multi_dot(
[self.G_right, am.S, am.G_right.transpose(), am.G_right, am.S.transpose()])
div = np.divide(num, den + 0.000001) #+ sys.float_info.min) # TODO: per questo si generava l'errore
div = np.divide(num, den + 0.000001) # + sys.float_info.min)
# print(np.isnan(div))
return np.multiply(self.G_right, div)

def update_G_left(self):
Expand All @@ -290,7 +292,8 @@ def update_G_left(self):
den += np.linalg.multi_dot(
[self.G_left, am.S.transpose(), am.G_left.transpose(), am.G_left, am.S])
# print(den)
div = np.divide(num, den + 0.000001) # TODO: secondo me il problema è qui dove si fa un 0/0
div = np.divide(num, den + 0.000001)
#print(np.isnan(div))
# print(div)
return np.multiply(self.G_left, div)

Expand All @@ -299,12 +302,13 @@ def update_S(self):
den = np.linalg.multi_dot(
[self.G_left.transpose(), self.G_left, self.S, self.G_right.transpose(), self.G_right])
div = np.divide(num, den + 0.000001)
#print(np.isnan(div))
return np.multiply(self.S, div)

def update(self):
if self.G_right_primary:
self.G_right = self.update_G_right() # update_G_right(self) #
self.G_right = self.update_G_right()
if self.G_left_primary:
self.G_left = self.update_G_left() # update_G_left(self) #
self.S = self.update_S() # update_S(self) #
self.G_left = self.update_G_left()
self.S = self.update_S()

28 changes: 17 additions & 11 deletions scripts/Network.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import sys
from scipy import linalg as la
from sklearn.cluster import KMeans
from time import time


class Network:
Expand Down Expand Up @@ -106,9 +107,14 @@ def __init__(self, graph_topology_file, dirfilename, verbose, rng, mask=1):
for data in graph_topology["ranks"]:
dsname = data["dsname"]
k = data["k"]
self.dataset_ks[dsname.upper()] = int(k)
try:
self.dataset_ks[dsname.upper()] = int(k)
except:
pass
if "k_svd" in graph_topology:
self.k_svd = graph_topology["k_svd"]
self.k_svd = graph_topology["k_svd"] #TODO:svd
else:
self.k_svd = 10 # Default value

# For each category of nodes, compute the intersection or union between the different matrices
for element in graph_topology["graph.datasets"]:
Expand Down Expand Up @@ -172,13 +178,12 @@ def __init__(self, graph_topology_file, dirfilename, verbose, rng, mask=1):

# This part is setting the ranks for the different AssociationMatrix
# ( remember am = G_left^{M,k1} @ S^{k1,k2} @ G_right^{k2,N} )
# NOT CLEAR how k is used
for k in self.datasets.keys():
rank = self.select_rank(k)
for ds in self.datasets.keys():
rank = self.select_rank(ds)
for am in self.association_matrices:
if am.leftds == k:
if am.leftds == ds:
am.k1 = int(rank)
elif am.rightds == k:
elif am.rightds == ds:
am.k2 = int(rank)

# self.association_matrices = list(self.pool.starmap(initialize, [(am, self.init_strategy, False) for am in self.association_matrices]))
Expand All @@ -192,14 +197,16 @@ def __init__(self, graph_topology_file, dirfilename, verbose, rng, mask=1):
processes[i].join()
self.association_matrices[i] = results[i]"""

start_initialization_time = time()
for am in self.association_matrices:
am.initialize(self.init_strategy, verbose)

end_initialization_time = time()
print("Total Network Initialization time:", end_initialization_time-start_initialization_time)
"""for am in self.association_matrices:
am.create_update_rules()"""

# Method to calculate rank for each datatype. In case of k-means and spherical k-means initialization represents
# number of clusters. TODO: Besides in case of svd rank represents the "compression" magnitude
# number of clusters. Besides in case of svd rank represents the "compression" magnitude
def select_rank(self, ds_name):
"""
Method to calculate rank for each datatype. In case of k-means and spherical k-means initialization represents
Expand All @@ -214,7 +221,7 @@ def select_rank(self, ds_name):
int
rank
"""
# TODO: rank set by the user

if ds_name in self.dataset_ks:
rank = self.dataset_ks[ds_name]
else:
Expand Down Expand Up @@ -253,7 +260,6 @@ def get_error(self):
float :
current error of the network
"""
# print(sum([am.get_error() for am in self.association_matrices]))
return sum([am.get_error() for am in self.association_matrices])

def update(self):
Expand Down
28 changes: 28 additions & 0 deletions scripts/processNetwork.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,20 @@

# args: (dirname_1, dirname_2, rng, metric, max_iter)
def runNetworkMM(args, results, metricsArr, pos):
"""
creates a Network with Maximum Metrix evaluation
Parameters
----------
args : (dirname_1, dirname_2, rng, metric, max_iter)
args necessary to the Network to be created:
results
shared array containing the best epsilons
metricsArr
shared array containing the Metric samples every 10 iterations
pos : int
integer identifying the run
"""
max_i = args[4]
met = args[3]
V = []
Expand Down Expand Up @@ -35,6 +49,20 @@ def runNetworkMM(args, results, metricsArr, pos):

# args: (dirname_1, dirname_2, rng, metric, max_iter)
def runNetworkRE(args, results, metricsArr, pos):
"""
creates a Network with Relative Error evaluation
Parameters
----------
args : (dirname_1, dirname_2, rng, metric, max_iter)
args necessary to the Network to be created:
results
shared array containing the best epsilons
metricsArr
shared array containing the Metric samples every 10 iterations
pos : int
integer identifying the run
"""
epsilon = 0
max_i = args[4]
met = args[3]
Expand Down

0 comments on commit 14762a9

Please sign in to comment.