Source code for libreco.data.transformed
"""Transformed Dataset."""
from collections import defaultdict
from random import seed as set_random_seed
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from .consumed import interaction_consumed
from ..sampling import negatives_from_unconsumed
[docs]class TransformedSet:
"""Dataset after transforming.
Often generated by calling functions in ``DatasetPure`` or ``DatasetFeat``,
then ``TransformedSet`` will be used in formal training.
Parameters
----------
user_indices : numpy.ndarray
All user rows in data, represented in inner id.
item_indices : numpy.ndarray
All item rows in data, represented in inner id.
labels : numpy.ndarray
All labels in data.
sparse_indices : numpy.ndarray or None, default: None
All sparse rows in data, represented in inner id.
dense_values : numpy.ndarray or None, default: None
All dense rows in data.
See Also
--------
:class:`~libreco.data.dataset.DatasetPure`
:class:`~libreco.data.dataset.DatasetFeat`
"""
def __init__(
self,
user_indices=None,
item_indices=None,
labels=None,
sparse_indices=None,
dense_values=None,
):
self._user_indices = user_indices
self._item_indices = item_indices
self._labels = labels
self._sparse_indices = sparse_indices
self._dense_values = dense_values
self._sparse_interaction = self.construct_sparse()
def construct_sparse(self):
interaction = pd.DataFrame(
{"user": self.user_indices, "item": self.item_indices, "label": self.labels}
)
interaction = interaction.drop_duplicates(subset=["user", "item"], keep="last")
user_indices = interaction["user"].to_numpy()
item_indices = interaction["item"].to_numpy()
labels = interaction["label"].to_numpy()
return csr_matrix((labels, (user_indices, item_indices)), dtype=np.float32)
def __len__(self):
return len(self.labels)
def __getitem__(self, index):
"""Get a slice of data."""
return self.user_indices[index], self.item_indices[index], self.labels[index]
@property
def user_indices(self):
"""All user rows in data"""
return self._user_indices
@property
def item_indices(self):
"""All item rows in data"""
return self._item_indices
@property
def sparse_indices(self):
"""All sparse rows in data"""
return self._sparse_indices
@property
def dense_values(self):
"""All dense rows in data"""
return self._dense_values
@property
def labels(self):
"""All labels in data"""
return self._labels
@property
def sparse_interaction(self):
"""User-item interaction data, in :class:`scipy.sparse.csr_matrix` format."""
return self._sparse_interaction
[docs]class TransformedEvalSet:
"""Dataset after transforming.
Often generated by calling functions in ``DatasetPure`` or ``DatasetFeat``,
then ``TransformedEvalSet`` will be used in evaluation and testing.
Parameters
----------
user_indices : numpy.ndarray
All user rows in data, represented in inner id.
item_indices : numpy.ndarray
All item rows in data, represented in inner id.
labels : numpy.ndarray
All labels in data.
"""
def __init__(self, user_indices, item_indices, labels):
self.user_indices = np.asarray(user_indices)
self.item_indices = np.asarray(item_indices)
self.labels = np.asarray(labels)
self.has_sampled = False
self.positive_consumed = self._get_positive_consumed()
# noinspection PyUnresolvedReferences
def _get_positive_consumed(self):
# data without label column has dummy labels 0
label_all_positive = np.all(np.asarray(self.labels) == 0)
user_indices = self.user_indices.tolist()
item_indices = self.item_indices.tolist()
labels = self.labels.tolist()
user_consumed = defaultdict(list)
for u, i, lb in zip(user_indices, item_indices, labels):
if label_all_positive or lb != 0:
user_consumed[u].append(i)
return {u: np.unique(items).tolist() for u, items in user_consumed.items()}
[docs] def build_negatives(self, n_items, num_neg, seed):
"""Perform negative sampling on all the data contained.
Parameters
----------
n_items : int
Number of total items.
num_neg : int
Number of negative samples for each positive sample.
seed : int
Random seed.
"""
set_random_seed(seed)
self.has_sampled = True
# use original users and items to sample
items_neg = self._sample_neg_items(
self.user_indices, self.item_indices, n_items, num_neg
)
self.user_indices = np.repeat(self.user_indices, num_neg + 1)
self.item_indices = np.repeat(self.item_indices, num_neg + 1)
self.labels = np.zeros_like(self.item_indices, dtype=np.float32)
self.labels[:: (num_neg + 1)] = 1.0
assert len(self.item_indices) == len(items_neg) * (num_neg + 1) / num_neg
for i in range(num_neg):
self.item_indices[(i + 1) :: (num_neg + 1)] = items_neg[i::num_neg]
def _sample_neg_items(self, users, items, n_items, num_neg):
user_consumed, _ = interaction_consumed(self.user_indices, self.item_indices)
user_consumed_set = {u: set(uis) for u, uis in user_consumed.items()}
return negatives_from_unconsumed(
user_consumed_set, users, items, n_items, num_neg
)
def __len__(self):
return len(self.labels)
def __getitem__(self, index):
"""Get a slice of data."""
return self.user_indices[index], self.item_indices[index], self.labels[index]