Source code for openbiolink.evaluation.evaluation


import time
from abc import ABC, abstractmethod
from typing import Dict, Tuple

import torch
from tqdm import tqdm

from openbiolink.evaluation.dataLoader import DataLoader


class Evaluator(ABC):
    """Base class for ranking-based evaluation of a link-prediction model.

    Subclasses implement :meth:`score_batch`; :meth:`evaluate` then ranks the true head and
    the true tail of every test triple against the remaining candidate entities and reports
    hits@1, hits@3, hits@10 and the mean reciprocal rank (MRR).

    :param dl: Dataloader containing the OpenBioLink dataset
    :param higher_is_better: Boolean which should be set to `True` if higher scores are considered better,
        `False` otherwise.
    """

    def __init__(self, dl: DataLoader, higher_is_better: bool = True):
        self.dl: DataLoader = dl
        self.higher_is_better = higher_is_better

    def _get_ranking(self, y_pred_pos_head, y_pred_neg_head, y_pred_pos_tail, y_pred_neg_tail):
        """Compute the 1-based rank of every positive score among its candidate scores.

        The rank is one plus the number of candidates that score at least as well as the
        positive, so ties are counted against the positive (pessimistic ranking).

        :param y_pred_pos_head: Scores of the true heads, one per row after `view(-1, 1)`
        :param y_pred_neg_head: Candidate head scores, one row per test triple
        :param y_pred_pos_tail: Scores of the true tails, one per row after `view(-1, 1)`
        :param y_pred_neg_tail: Candidate tail scores, one row per test triple
        :return: 1-D tensor holding all head ranks followed by all tail ranks
        """
        pos_head = y_pred_pos_head.view(-1, 1)
        pos_tail = y_pred_pos_tail.view(-1, 1)

        if self.higher_is_better:
            at_least_as_good_head = y_pred_neg_head >= pos_head
            at_least_as_good_tail = y_pred_neg_tail >= pos_tail
        else:
            at_least_as_good_head = y_pred_neg_head <= pos_head
            at_least_as_good_tail = y_pred_neg_tail <= pos_tail

        ranking_head = torch.sum(at_least_as_good_head, dim=1) + 1
        ranking_tail = torch.sum(at_least_as_good_tail, dim=1) + 1
        return torch.cat([ranking_head, ranking_tail], dim=0)

    def _get_result(self, ranking_lists: list):
        """Aggregate per-batch rank tensors into hits@k and MRR.

        The values of the returned dictionary are produced by tensor arithmetic, i.e. they
        are 0-dim tensors; use `float(value)` or `value.item()` to obtain plain numbers.
        An empty `ranking_lists` results in a division by zero.

        :param ranking_lists: List of 1-D rank tensors as returned by `_get_ranking`
        :return: Dictionary with the keys 'hits@1', 'hits@3', 'hits@10' and 'mrr'
        """
        hits = {1: 0., 3: 0., 10: 0.}
        mrr = 0.
        count = 0

        for ranking_list in ranking_lists:
            for k in hits:
                hits[k] = hits[k] + (ranking_list <= k).sum()
            mrr = mrr + (1. / ranking_list).sum()
            count = count + ranking_list.shape[0]

        result = {'hits@{}'.format(k): total / count for k, total in hits.items()}
        result['mrr'] = mrr / count
        return result

    def _evaluate_batch(self, batch):
        """Score one batch of test triples and return the ranks of the true heads and tails.

        :param batch: Batch of test data; column 0 is used as head index and column 2 as
            tail index into the score matrices returned by `score_batch`
        :return: 1-D tensor of ranks, see `_get_ranking`
        """
        # Fill value handed to `dl.filter_scores` (presumably written over the entries that
        # must not compete with the positive -- confirm against DataLoader.filter_scores).
        # NaN never satisfies `>=`, and +Inf is never `<=` a finite score, so entries
        # holding the fill value do not contribute to the rank.
        fill_value = float('nan') if self.higher_is_better else float('Inf')

        # Ranks are integer counts and can never carry gradients, so scoring is done
        # without autograd bookkeeping to avoid building a graph for every batch.
        with torch.no_grad():
            scores_head, scores_tail = self.score_batch(batch)

            head_index = batch[:, 0].view(-1, 1)
            tail_index = batch[:, 2].view(-1, 1)
            pos_scores_head = scores_head.gather(1, head_index).view(-1, 1)
            pos_scores_tail = scores_tail.gather(1, tail_index).view(-1, 1)

            neg_scores_head = self.dl.filter_scores(batch, 0, scores_head, fill_value)
            neg_scores_tail = self.dl.filter_scores(batch, 2, scores_tail, fill_value)

            return self._get_ranking(pos_scores_head, neg_scores_head, pos_scores_tail, neg_scores_tail)

    def evaluate(self, batch_size=100) -> Dict[str, float]:
        """Evaluates a model by retrieving scores from the (implemented) score_batch function.

        Prints the elapsed wall-clock time once all batches have been processed.

        :param batch_size: Integer determining the size of the test batch which is passed to function `score_batch`
        :return: Dictionary containing the evaluation results (keys: 'hits@1', 'hits@3', 'hits@10', 'mrr')
        """
        start = time.time()
        n_batches, batches = self.dl.get_test_batches(batch_size)
        result = [self._evaluate_batch(batch) for batch in tqdm(batches, total=n_batches)]
        print('Evaluation took {:.3f} seconds'.format(time.time() - start))
        return self._get_result(result)

    @abstractmethod
    def score_batch(self, batch: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Abstract function, has to be implemented.

        Should return two arrays containing the head and tail scores of a batch of test data
        from a model. It is called with gradient tracking disabled.

        :param batch: Batch of test data. Shape `(batch_size,3)`
        :return:
            + head_scores: `torch.tensor` where the value at [i,j] is the score of the triple
              `(j, batch[i][1], batch[i][2])`. Shape `(batch_size, num_entities)`
            + tail_scores: `torch.tensor` where the value at [i,j] is the score of the triple
              `(batch[i][0], batch[i][1], j)`. Shape `(batch_size, num_entities)`
        """
        raise NotImplementedError