Source code for openbiolink.evaluation.evaluation


import time
from abc import ABC, abstractmethod
from typing import Dict, Tuple

import torch
from tqdm import tqdm

from openbiolink.evaluation.dataLoader import DataLoader


class Evaluator(ABC):
    """Abstract base class for ranking-based evaluation of a link-prediction model.

    Subclasses implement :meth:`score_batch`; :meth:`evaluate` then ranks the
    true head and tail entity of every test triple against the remaining
    candidates and reports hits@1, hits@3, hits@10 and MRR.

        :param dl: Dataloader containing the OpenBioLink dataset
        :param higher_is_better: Boolean which should be set to `True` if higher scores are considered better, `False` otherwise.
    """

    def __init__(self, dl: "DataLoader", higher_is_better: bool = True):
        # Only `get_test_batches` and `filter_scores` of the loader are used here.
        self.dl: "DataLoader" = dl
        self.higher_is_better = higher_is_better

    def _get_ranking(self, y_pred_pos_head, y_pred_neg_head, y_pred_pos_tail, y_pred_neg_tail):
        """Compute the rank of every positive score among its candidate scores.

        The rank is ``1 +`` the number of candidates in the same row that score
        at least as well as the positive, so ties count against the positive.

        :param y_pred_pos_head:
            Scores of the true heads, one per row (reshaped to a column internally).
        :param y_pred_neg_head:
            Candidate head scores, one row per triple.
        :param y_pred_pos_tail:
            Scores of the true tails, one per row.
        :param y_pred_neg_tail:
            Candidate tail scores, one row per triple.

        :return:
            1-D tensor holding all head ranks followed by all tail ranks.
        """
        # NOTE(review): a NaN *positive* score compares False against every
        # candidate and therefore receives rank 1 -- confirm models cannot emit NaN.
        pos_head = y_pred_pos_head.view(-1, 1)
        pos_tail = y_pred_pos_tail.view(-1, 1)
        if self.higher_is_better:
            ranking_head = torch.sum(y_pred_neg_head >= pos_head, dim=1) + 1
            ranking_tail = torch.sum(y_pred_neg_tail >= pos_tail, dim=1) + 1
        else:
            ranking_head = torch.sum(y_pred_neg_head <= pos_head, dim=1) + 1
            ranking_tail = torch.sum(y_pred_neg_tail <= pos_tail, dim=1) + 1
        return torch.cat([ranking_head, ranking_tail], dim=0)

    def _get_result(self, ranking_lists: list):
        """Aggregate per-batch rank tensors into the final metrics.

        :param ranking_lists:
            List of 1-D rank tensors as produced by :meth:`_get_ranking`.

        :return:
            Dictionary with the keys 'hits@1', 'hits@3', 'hits@10' and 'mrr'.
            The sums are accumulated as torch tensors, so the values are 0-dim
            tensors; use ``float(value)`` if plain Python numbers are required.

        :raises ZeroDivisionError: if `ranking_lists` is empty.
        """
        hits1 = 0.
        hits3 = 0.
        hits10 = 0.
        mrr = 0.
        count = 0
        for ranking_list in ranking_lists:
            hits1 = hits1 + (ranking_list <= 1).sum()
            hits3 = hits3 + (ranking_list <= 3).sum()
            hits10 = hits10 + (ranking_list <= 10).sum()
            mrr = mrr + (1. / ranking_list).sum()
            count = count + ranking_list.shape[0]
        return {'hits@1': hits1 / count,
                'hits@3': hits3 / count,
                'hits@10': hits10 / count,
                'mrr': mrr / count}

    def _evaluate_batch(self, batch):
        """Score one batch of test triples and return the head and tail ranks.

        :param batch:
            Integer tensor of test triples; column 0 is used as the head index
            and column 2 as the tail index into the score matrices.

        :return:
            1-D tensor of ranks (heads first, then tails), see :meth:`_get_ranking`.
        """
        # Only integer ranks leave this method, so no autograd graph is needed;
        # disabling gradient tracking avoids keeping a graph over the full
        # (batch_size, num_entities) score matrices while evaluating.
        with torch.no_grad():
            scores_head, scores_tail = self.score_batch(batch)

            # Extract the positive scores *before* filtering, in case
            # `filter_scores` overwrites the score tensors in place.
            pos_scores_head = scores_head.gather(1, batch[:, 0].view(-1, 1)).view(-1, 1)
            pos_scores_tail = scores_tail.gather(1, batch[:, 2].view(-1, 1)).view(-1, 1)

            # Value handed to `filter_scores` (defined in the data loader;
            # presumably written over the entries of known true triples -- confirm).
            # NaN is never `>=` anything and Inf is never `<=` a finite score,
            # so such entries do not contribute to the rank.
            filter_value = float('nan') if self.higher_is_better else float('Inf')
            neg_scores_head = self.dl.filter_scores(batch, 0, scores_head, filter_value)
            neg_scores_tail = self.dl.filter_scores(batch, 2, scores_tail, filter_value)

            return self._get_ranking(pos_scores_head, neg_scores_head, pos_scores_tail, neg_scores_tail)

    def evaluate(self, batch_size=100) -> Dict[str, float]:
        """Evaluates a model by retrieving scores from the (implemented) score_batch function.

        Iterates over the test batches of the data loader with a progress bar
        and prints the elapsed wall-clock time when finished.

        :param batch_size:
            Integer determining the size of the test batch which is passed to function `score_batch`

        :return:
            Dictionary containing the evaluation results (keys: 'hits@1', 'hits@3', 'hits@10', 'mrr').
            NOTE: the values are produced by tensor arithmetic and are 0-dim
            tensors rather than plain floats.
        """

        start = time.time()
        n_batches, batches = self.dl.get_test_batches(batch_size)

        result = []
        for batch in tqdm(batches, total=n_batches):
            result.append(self._evaluate_batch(batch))

        print('Evaluation took {:.3f} seconds'.format(time.time() - start))
        return self._get_result(result)

    @abstractmethod
    def score_batch(self, batch: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Abstract function, has to be implemented. Should return two arrays containing the head and tail scores of a batch of test data from a model.

        Called with gradient tracking disabled.

        :param batch:
            Batch of test data. Shape `(batch_size,3)`

        :return:
            + head_scores: `torch.tensor` where the value at [i,j] is the score of the triple `(j, batch[i][1], batch[i][2])`. Shape `(batch_size, num_entities)`
            + tail_scores: `torch.tensor` where the value at [i,j] is the score of the triple `(batch[i][0], batch[i][1], j)`. Shape `(batch_size, num_entities)`
        """
        raise NotImplementedError