Skip to content

Commit

Permalink
Add recommend() function for the base class of Recommender (#538)
Browse files Browse the repository at this point in the history
  • Loading branch information
tqtg committed Oct 30, 2023
1 parent e5eb3da commit edc83aa
Show file tree
Hide file tree
Showing 62 changed files with 1,583 additions and 1,353 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ __pycache__/

# C extensions
*.so
cornac/models/*/*.cpp
cornac/models/*/cython/*.cpp
cornac/utils/*.cpp

# Distribution / packaging
bin/
Expand Down
89 changes: 24 additions & 65 deletions cornac/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,13 @@ class Dataset(object):
global_mean: float
Average value over the rating observations.
uir_tuple: tuple
Tuple of three numpy arrays (user_indices, item_indices, rating_values).
timestamps: numpy.array
Numpy array of timestamps corresponding to feedback in `uir_tuple`.
This is only available when input data is in `UIRT` format.
"""

def __init__(
Expand Down Expand Up @@ -99,12 +98,8 @@ def __init__(
self.min_rating = np.min(r_values)
self.global_mean = np.mean(r_values)

self.__total_users = None
self.__total_items = None
self.__user_ids = None
self.__item_ids = None
self.__user_indices = None
self.__item_indices = None

self.__user_data = None
self.__item_data = None
Expand All @@ -114,47 +109,19 @@ def __init__(
self.__csc_matrix = None
self.__dok_matrix = None

@property
def total_users(self):
"""Total number of users including test and validation users if exists"""
return self.__total_users if self.__total_users is not None else self.num_users

@total_users.setter
def total_users(self, input_value):
"""Set total number of users for the dataset"""
assert input_value >= self.num_users
self.__total_users = input_value

@property
def total_items(self):
"""Total number of items including test and validation items if exists"""
return self.__total_items if self.__total_items is not None else self.num_items

@total_items.setter
def total_items(self, input_value):
"""Set total number of items for the dataset"""
assert input_value >= self.num_items
self.__total_items = input_value

@property
def user_ids(self):
"""An iterator over the raw user ids"""
return self.uid_map.keys()
"""Return the list of raw user ids"""
if self.__user_ids is None:
self.__user_ids = list(self.uid_map.keys())
return self.__user_ids

@property
def item_ids(self):
"""An iterator over the raw item ids"""
return self.iid_map.keys()

@property
def user_indices(self):
"""An iterator over the user indices"""
return self.uid_map.values()

@property
def item_indices(self):
"""An iterator over the item indices"""
return self.iid_map.values()
"""Return the list of raw item ids"""
if self.__item_ids is None:
self.__item_ids = list(self.iid_map.keys())
return self.__item_ids

@property
def user_data(self):
Expand Down Expand Up @@ -185,7 +152,7 @@ def item_data(self):
@property
def chrono_user_data(self):
"""Data organized by user sorted chronologically (timestamps required).
A dictionary where keys are users, values are tuples of three chronologically
A dictionary where keys are users, values are tuples of three chronologically
sorted lists (items, ratings, timestamps) interacted by the corresponding users.
"""
if self.timestamps is None:
Expand Down Expand Up @@ -214,7 +181,7 @@ def chrono_user_data(self):
@property
def chrono_item_data(self):
"""Data organized by item sorted chronologically (timestamps required).
A dictionary where keys are items, values are tuples of three chronologically
A dictionary where keys are items, values are tuples of three chronologically
sorted lists (users, ratings, timestamps) interacted with the corresponding items.
"""
if self.timestamps is None:
Expand Down Expand Up @@ -272,7 +239,7 @@ def dok_matrix(self):
"""The user-item interaction matrix in DOK sparse format"""
if self.__dok_matrix is None:
self.__dok_matrix = dok_matrix(
(self.num_users, self.num_items), dtype='float'
(self.num_users, self.num_items), dtype="float"
)
for u, i, r in zip(*self.uir_tuple):
self.__dok_matrix[u, i] = r
Expand Down Expand Up @@ -364,27 +331,29 @@ def build(
raise ValueError("data is empty after being filtered!")

uir_tuple = (
np.asarray(u_indices, dtype='int'),
np.asarray(i_indices, dtype='int'),
np.asarray(r_values, dtype='float'),
np.asarray(u_indices, dtype="int"),
np.asarray(i_indices, dtype="int"),
np.asarray(r_values, dtype="float"),
)

timestamps = (
np.fromiter((int(data[i][3]) for i in valid_idx), dtype='int')
np.fromiter((int(data[i][3]) for i in valid_idx), dtype="int")
if fmt == "UIRT"
else None
)

return cls(
dataset = cls(
num_users=len(global_uid_map),
num_items=len(global_iid_map),
uid_map=uid_map,
iid_map=iid_map,
uid_map=global_uid_map,
iid_map=global_iid_map,
uir_tuple=uir_tuple,
timestamps=timestamps,
seed=seed,
)

return dataset

@classmethod
def from_uir(cls, data, seed=None):
"""Constructing Dataset from UIR (User, Item, Rating) triplet data.
Expand All @@ -407,7 +376,7 @@ def from_uir(cls, data, seed=None):

@classmethod
def from_uirt(cls, data, seed=None):
"""Constructing Dataset from UIRT (User, Item, Rating, Timestamp)
"""Constructing Dataset from UIRT (User, Item, Rating, Timestamp)
quadruplet data.
Parameters
Expand Down Expand Up @@ -528,7 +497,6 @@ def uij_iter(self, batch_size=1, shuffle=False, neg_sampling="uniform"):
batch of negative items (array of 'int')
"""

if neg_sampling.lower() == "uniform":
neg_population = np.arange(self.num_items)
elif neg_sampling.lower() == "popularity":
Expand Down Expand Up @@ -564,7 +532,7 @@ def user_iter(self, batch_size=1, shuffle=False):
-------
iterator : batch of user indices (array of 'int')
"""
user_indices = np.fromiter(self.user_indices, dtype='int')
user_indices = np.fromiter(set(self.uir_tuple[0]), dtype="int")
for batch_ids in self.idx_iter(len(user_indices), batch_size, shuffle):
yield user_indices[batch_ids]

Expand All @@ -582,18 +550,10 @@ def item_iter(self, batch_size=1, shuffle=False):
-------
iterator : batch of item indices (array of 'int')
"""
item_indices = np.fromiter(self.item_indices, 'int')
item_indices = np.fromiter(set(self.uir_tuple[1]), "int")
for batch_ids in self.idx_iter(len(item_indices), batch_size, shuffle):
yield item_indices[batch_ids]

def is_unk_user(self, user_idx):
"""Return whether or not a user is unknown given the user index"""
return user_idx >= self.num_users

def is_unk_item(self, item_idx):
"""Return whether or not an item is unknown given the item index"""
return item_idx >= self.num_items

def add_modalities(self, **kwargs):
self.user_feature = kwargs.get("user_feature", None)
self.item_feature = kwargs.get("item_feature", None)
Expand All @@ -605,4 +565,3 @@ def add_modalities(self, **kwargs):
self.item_graph = kwargs.get("item_graph", None)
self.sentiment = kwargs.get("sentiment", None)
self.review_text = kwargs.get("review_text", None)

21 changes: 10 additions & 11 deletions cornac/eval_methods/base_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def rating_eval(model, metrics, test_set, user_based=False, verbose=False):
gt_mat = test_set.csr_matrix
pd_mat = csr_matrix((r_preds, (u_indices, i_indices)), shape=gt_mat.shape)

test_user_indices = set(u_indices)
for mt in metrics:
if user_based: # averaging over users
user_results.append(
Expand All @@ -93,7 +94,7 @@ def rating_eval(model, metrics, test_set, user_based=False, verbose=False):
gt_ratings=gt_mat.getrow(user_idx).data,
pd_ratings=pd_mat.getrow(user_idx).data,
).item()
for user_idx in test_set.user_indices
for user_idx in test_user_indices
}
)
avg_results.append(sum(user_results[-1].values()) / len(user_results[-1]))
Expand Down Expand Up @@ -159,7 +160,7 @@ def ranking_eval(
avg_results = []
user_results = [{} for _ in enumerate(metrics)]

gt_mat = test_set.csr_matrix
test_mat = test_set.csr_matrix
train_mat = train_set.csr_matrix
val_mat = None if val_set is None else val_set.csr_matrix

Expand All @@ -170,10 +171,11 @@ def pos_items(csr_row):
if rating >= rating_threshold
]

test_user_indices = set(test_set.uir_tuple[0])
for user_idx in tqdm(
test_set.user_indices, desc="Ranking", disable=not verbose, miniters=100
test_user_indices, desc="Ranking", disable=not verbose, miniters=100
):
test_pos_items = pos_items(gt_mat.getrow(user_idx))
test_pos_items = pos_items(test_mat.getrow(user_idx))
if len(test_pos_items) == 0:
continue

Expand All @@ -183,9 +185,9 @@ def pos_items(csr_row):

val_pos_items = [] if val_mat is None else pos_items(val_mat.getrow(user_idx))
train_pos_items = (
[]
if train_set.is_unk_user(user_idx)
else pos_items(train_mat.getrow(user_idx))
pos_items(train_mat.getrow(user_idx))
if user_idx < train_mat.shape[0]
else []
)

# binary mask for ground-truth negative items, removing all positive items
Expand All @@ -196,7 +198,7 @@ def pos_items(csr_row):
if exclude_unknowns:
u_gt_pos_mask = u_gt_pos_mask[: train_set.num_items]
u_gt_neg_mask = u_gt_neg_mask[: train_set.num_items]

item_indices = np.nonzero(u_gt_pos_mask + u_gt_neg_mask)[0]
u_gt_pos_items = np.nonzero(u_gt_pos_mask)[0]
u_gt_neg_items = np.nonzero(u_gt_neg_mask)[0]
Expand Down Expand Up @@ -538,9 +540,6 @@ def _build_datasets(self, train_data, test_data, val_data=None):
print("Total users = {}".format(self.total_users))
print("Total items = {}".format(self.total_items))

self.train_set.total_users = self.total_users
self.train_set.total_items = self.total_items

def _build_modalities(self):
for user_modality in [
self.user_feature,
Expand Down
39 changes: 19 additions & 20 deletions cornac/eval_methods/propensity_stratified_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,38 +25,38 @@ def ranking_eval(
props=None,
):
"""Evaluate model on provided ranking metrics.
Parameters
----------
model: :obj:`cornac.models.Recommender`, required
Recommender model to be evaluated.
metrics: :obj:`iterable`, required
List of ranking metrics :obj:`cornac.metrics.RankingMetric`.
train_set: :obj:`cornac.data.Dataset`, required
Dataset to be used for model training. This will be used to exclude
observations that already appeared during training.
test_set: :obj:`cornac.data.Dataset`, required
Dataset to be used for evaluation.
val_set: :obj:`cornac.data.Dataset`, optional, default: None
Dataset to be used for model selection. This will be used to exclude
observations that already appeared during validation.
rating_threshold: float, optional, default: 1.0
The threshold to convert ratings into positive or negative feedback.
exclude_unknowns: bool, optional, default: True
Ignore unknown users and items during evaluation.
verbose: bool, optional, default: False
Output evaluation progress.
props: dictionary, optional, default: None
items propensity scores
Returns
-------
res: (List, List)
Expand All @@ -82,12 +82,13 @@ def pos_items(csr_row):
if rating >= rating_threshold
]

for user_idx in tqdm.tqdm(test_set.user_indices, disable=not verbose, miniters=100):
test_user_indices = set(test_set.uir_tuple[0])
for user_idx in tqdm.tqdm(test_user_indices, disable=not verbose, miniters=100):
test_pos_items = pos_items(gt_mat.getrow(user_idx))
if len(test_pos_items) == 0:
continue

u_gt_pos = np.zeros(test_set.num_items, dtype='float')
u_gt_pos = np.zeros(test_set.num_items, dtype="float")
u_gt_pos[test_pos_items] = 1

val_pos_items = [] if val_mat is None else pos_items(val_mat.getrow(user_idx))
Expand All @@ -97,7 +98,7 @@ def pos_items(csr_row):
else pos_items(train_mat.getrow(user_idx))
)

u_gt_neg = np.ones(test_set.num_items, dtype='int')
u_gt_neg = np.ones(test_set.num_items, dtype="int")
u_gt_neg[test_pos_items + val_pos_items + train_pos_items] = 0

item_indices = None if exclude_unknowns else np.arange(test_set.num_items)
Expand Down Expand Up @@ -256,7 +257,7 @@ def _estimate_propensities(self):
item_freq[i] += 1

# fit the exponential param
data = np.array([e for e in item_freq.values()], dtype='float')
data = np.array([e for e in item_freq.values()], dtype="float")
results = powerlaw.Fit(data, discrete=True, fit_method="Likelihood")
alpha = results.power_law.alpha
fmin = results.power_law.xmin
Expand All @@ -276,9 +277,7 @@ def _build_stratified_dataset(self, test_data):
self.stratified_sets = {}

# match the corresponding propensity score for each feedback
test_props = np.array(
[self.props[i] for u, i, r in test_data], dtype='float'
)
test_props = np.array([self.props[i] for u, i, r in test_data], dtype="float")

# stratify
minp = min(test_props) - 0.01 * min(test_props)
Expand Down Expand Up @@ -338,11 +337,11 @@ def evaluate(self, model, metrics, user_based, show_validation=True):
metrics: :obj:`iterable`
List of metrics.
user_based: bool, required
Evaluation strategy for the rating metrics. Whether results
user_based: bool, required
Evaluation strategy for the rating metrics. Whether results
are averaged based on the number of users or the number of ratings.
show_validation: bool, optional, default: True
show_validation: bool, optional, default: True
Whether to show the results on validation set (if exists).
Returns
Expand Down
Loading

0 comments on commit edc83aa

Please sign in to comment.