"""Utility Functions for Evaluating Data."""
import functools
import math
import numbers
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, mean_absolute_error, r2_score, roc_auc_score
from .computation import (
build_eval_transformed_data,
compute_preds,
compute_probs,
compute_recommends,
)
from .metrics import (
LISTWISE_METRICS,
POINTWISE_METRICS,
RANKING_METRICS,
RATING_METRICS,
average_precision_at_k,
balanced_accuracy,
listwise_scores,
ndcg_at_k,
pr_auc_score,
precision_at_k,
rec_coverage,
recall_at_k,
rmse,
roc_gauc_score,
)
from ..data import TransformedEvalSet
def _check_metrics(task, metrics, k):
if not isinstance(metrics, (list, tuple)):
metrics = [metrics]
if task == "rating":
for m in metrics:
if m not in RATING_METRICS:
                raise ValueError(f"Metric `{m}` is not suitable for the rating task.")
elif task == "ranking":
for m in metrics:
if m not in RANKING_METRICS:
                raise ValueError(f"Metric `{m}` is not suitable for the ranking task.")
if not isinstance(k, numbers.Integral):
        raise TypeError("`k` must be an integer")
return metrics
def sample_users(data, seed, num):
np_rng = np.random.default_rng(seed)
unique_users = list(data.positive_consumed)
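    # Sample a fixed subset only when `num` is a positive integer smaller than
    # the total number of users; otherwise evaluate on all users.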
if isinstance(num, numbers.Integral) and 0 < num < len(unique_users):
users = np_rng.choice(unique_users, num, replace=False).tolist()
else:
users = unique_users
return users
def evaluate(
model,
data,
neg_sampling,
eval_batch_size=8192,
metrics=None,
k=10,
sample_user_num=None,
seed=42,
):
"""Evaluate the model on specific data and metrics.
Parameters
----------
model : Base
Model for evaluation.
data : :class:`pandas.DataFrame` or :class:`~libreco.data.TransformedEvalSet`
Data to evaluate.
neg_sampling : bool
        Whether to perform negative sampling on the evaluation data.
eval_batch_size : int, default: 8192
Batch size used in evaluation.
metrics : list or None, default: None
        List of metrics used for evaluation.
k : int, default: 10
        The `k` used in top-k metrics, e.g. recall@k, ndcg@k.
sample_user_num : int or None, default: None
        Number of users used in evaluation. By default, all users in the eval data
        are used. Setting it to a positive number will randomly sample that many
        users from the eval data.
seed : int, default: 42
Random seed.
Returns
-------
eval_results : dict of {str : float}
Evaluation results for the model and data.
Examples
--------
>>> eval_result = evaluate(model, data, neg_sampling=True, metrics=["roc_auc", "precision", "recall"])
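    >>> # hypothetical: assumes `rating_model` was trained with task="rating"
    >>> result = evaluate(rating_model, data, neg_sampling=False, metrics=["rmse", "mae"])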
"""
if not isinstance(data, (pd.DataFrame, TransformedEvalSet)):
raise ValueError("`data` must be `pandas.DataFrame` or `TransformedEvalSet`")
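    # Convert the raw data into an evaluation-ready format; negative sampling
    # is performed here when `neg_sampling` is True.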
data = build_eval_transformed_data(model, data, neg_sampling, seed)
if not metrics:
metrics = ["loss"]
metrics = _check_metrics(model.task, metrics, k)
eval_result = dict()
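    # Rating task: regression-style metrics computed directly on predicted ratings.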
if model.task == "rating":
y_pred, y_true = compute_preds(model, data, eval_batch_size)
for m in metrics:
if m in ["rmse", "loss"]:
eval_result[m] = rmse(y_true, y_pred)
elif m == "mae":
eval_result[m] = mean_absolute_error(y_true, y_pred)
elif m == "r2":
eval_result[m] = r2_score(y_true, y_pred)
else:
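        # Ranking task: pointwise metrics are computed from per-sample probabilities,
        # listwise metrics from top-k recommendation lists.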
if POINTWISE_METRICS.intersection(metrics):
y_prob, y_true = compute_probs(model, data, eval_batch_size)
for m in metrics:
if m in ["log_loss", "loss"]:
eval_result[m] = log_loss(y_true, y_prob, eps=1e-7)
elif m == "balanced_accuracy":
eval_result[m] = balanced_accuracy(y_true, y_prob)
elif m == "roc_auc":
eval_result[m] = roc_auc_score(y_true, y_prob)
elif m == "roc_gauc":
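                    # Grouped AUC: ROC AUC computed per user (samples grouped by
                    # `data.user_indices`), then aggregated across users.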
eval_result[m] = roc_gauc_score(y_true, y_prob, data.user_indices)
elif m == "pr_auc":
eval_result[m] = pr_auc_score(y_true, y_prob)
if LISTWISE_METRICS.intersection(metrics):
users = sample_users(data, seed, sample_user_num)
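            # Listwise metrics score every item for each user, so the user batch size
            # is chosen such that users_per_batch * n_items is roughly `eval_batch_size`.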
num_batch_users = max(1, math.floor(eval_batch_size / model.n_items))
y_trues = data.positive_consumed
y_recos = compute_recommends(model, users, k, num_batch_users)
for m in metrics:
if m not in LISTWISE_METRICS:
continue
if m == "coverage":
eval_result[m] = rec_coverage(y_recos, users, model.n_items)
continue
elif m == "precision":
fn = precision_at_k
elif m == "recall":
fn = recall_at_k
elif m == "map":
fn = average_precision_at_k
elif m == "ndcg":
fn = ndcg_at_k
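            # `fn` is always bound here: non-listwise metrics are skipped above and
            # "coverage" continues early, so only precision/recall/map/ndcg reach this line.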
# noinspection PyUnboundLocalVariable
eval_result[m] = listwise_scores(fn, y_trues, y_recos, users, k)
return eval_result
def print_metrics(
model,
neg_sampling,
# train_data=None,
eval_data=None,
metrics=None,
eval_batch_size=8192,
k=10,
sample_user_num=2048,
seed=42,
):
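    # The generic "loss" metric maps to rmse for rating tasks and to log_loss for ranking tasks.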
loss_name = "rmse" if model.task == "rating" else "log_loss"
metrics_fn = functools.partial(
evaluate,
model=model,
neg_sampling=neg_sampling,
eval_batch_size=eval_batch_size,
k=k,
sample_user_num=sample_user_num,
seed=seed,
)
# if train_data:
# train_metrics = metrics_fn(data=train_data, metrics=[loss_name])
# print(f"\t train {loss_name}: {train_metrics[loss_name]:.4f}")
if eval_data:
eval_metrics = metrics_fn(data=eval_data, metrics=metrics)
for m, val in eval_metrics.items():
if m == "loss":
metric = loss_name
elif m in LISTWISE_METRICS:
metric = f"{m}@{k}"
else:
metric = m
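            # Coverage is reported as a percentage; other metrics with 4 decimal places.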
str_val = f"{round(val, 2)}%" if m == "coverage" else f"{val:.4f}"
print(f"\t eval {metric}: {str_val}")