# Source code for rankeval.dataset.dataset

# Copyright (c) 2017, All Contributors (see CONTRIBUTORS file)
# Authors: Salvatore Trani <salvatore.trani@isti.cnr.it>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
This module implements the generic class for loading/dumping a dataset from/to
file.
"""
import numpy as np
import numbers

from .svmlight_format import load_svmlight_file, dump_svmlight_file


class Dataset(object):
    """
    This class describes the dataset object, with its utilities and features.

    Attributes
    ----------
    X : numpy 2d array of float
        It is a dense numpy matrix of shape (n_instances, n_features). It is
        set to None once clear_X() has been called.
    y : numpy 1d array of float
        It is a ndarray of shape (n_instances,) with the gold label
    query_ids : numpy 1d array of int
        It is a ndarray of shape (n_queries,) with the distinct query ids,
        in order of first appearance in the dataset
    query_offsets : numpy 1d array of int
        It is a ndarray of shape (n_queries+1,) with the start and end
        offsets of each query. In particular, the i-th query has indices
        ranging in [ query_offsets[i], query_offsets[i+1] ), with the latter
        excluded.
    name : str
        The name to give to the dataset
    n_instances : int
        The number of instances in the dataset
    n_features : int
        The number of features in the dataset
    n_queries : int
        The number of queries in the dataset
    """

    def __init__(self, X, y, query_ids, name=None):
        """
        Build a dataset from the feature matrix, the labels and the query id
        of each instance.

        The instances of a query are expected to be stored contiguously:
        only the first occurrence of each query id is used to compute the
        query offsets.

        Parameters
        ----------
        X : numpy.ndarray
            The matrix with feature values
        y : numpy.array
            The vector with label values
        query_ids : numpy.array
            The vector with the query_id for each sample.
        name : str
            The name to give to the dataset. If None, a name reporting the
            shape of X is generated.

        Raises
        ------
        Exception
            If query_ids does not have one entry per row of X.
        """
        # accept any array-like (e.g., plain lists) for the query ids
        query_ids = np.asarray(query_ids)

        if query_ids.size != X.shape[0]:
            raise Exception("query_ids argument has not the correct shape!")

        # convert from query_ids per sample to query offset
        self.query_ids, self.query_offsets = \
            np.unique(query_ids, return_index=True)

        # np.unique returns the qids sorted by value: re-sort them by offset
        # so as to keep the order in which the queries appear in the data
        idx_sort = np.argsort(self.query_offsets)
        self.query_ids = self.query_ids[idx_sort]
        self.query_offsets = self.query_offsets[idx_sort]

        # sentinel offset marking the end of the last query
        self.query_offsets = np.append(self.query_offsets, query_ids.size)

        self.X, self.y = X, y

        self.name = "Dataset %s" % (self.X.shape,)
        if name is not None:
            self.name = name

        self.n_instances = self.y.size
        self.n_features = self.X.shape[1]
        self.n_queries = self.query_ids.size

    @staticmethod
    def load(f, name=None, format="svmlight"):
        """
        This static method implements the loading of a dataset from file.

        Parameters
        ----------
        f : path
            The file name of the dataset to load
        name : str
            The name to be given to the current dataset
        format : str
            The format of the dataset file to load (currently only the
            "svmlight" format is supported)

        Returns
        -------
        dataset : Dataset
            The dataset read from file

        Raises
        ------
        TypeError
            If the requested format is not supported.
        """
        if format == "svmlight":
            X, y, query_ids = load_svmlight_file(f, query_id=True)
        else:
            raise TypeError("Dataset format %s is not yet supported!" % format)
        return Dataset(X, y, query_ids, name)

    def subset_features(self, features):
        """
        Create a new Dataset with only the features identified by the given
        features parameter (indices). It is useful for performing feature
        selection.

        Parameters
        ----------
        features : numpy array or list
            The indices of the features to select in the resulting dataset

        Returns
        -------
        dataset : rankeval.dataset.Dataset
            The resulting dataset with the given subset of features
        """
        # copy() guarantees the new dataset does not share the feature
        # matrix with this one, even when the selection yields a view
        return Dataset(self.X[:, features].copy(), self.y,
                       self.get_qids_dataset(), name=self.name)

    def dump(self, f, format):
        """
        This method implements the writing of a previously loaded dataset
        according to the given format on file.

        Parameters
        ----------
        f : path
            The file path where to store the dataset
        format : str
            The format to use for dumping the dataset on file (currently
            only the "svmlight" format is supported)

        Raises
        ------
        TypeError
            If the requested format is not supported.
        """
        if format != "svmlight":
            raise TypeError("Dataset format %s is not yet supported!" % format)

        # query ids are stored compactly (distinct ids + offsets): unroll
        # them to one query id per instance before writing
        dump_svmlight_file(self.X, self.y, f, self.get_qids_dataset())

    def split(self, train_size, vali_size=0, random_state=None):
        """
        This method splits the dataset into train/validation/test partitions.
        It shuffles the query ids before partitioning. If vali_size=0, the
        method does not create a validation set, thus returning only train
        and test sets. Otherwise it returns train/vali/test sets.

        Parameters
        ----------
        train_size : float
            The ratio of query ids in the training set. It should be between
            0 and 1.
        vali_size : float
            The ratio of query ids in the validation set. It should be
            between 0 and 1. 0 means no validation set is created.
        random_state : int
            If int, random_state is the seed used by the random number
            generator. If RandomState instance, random_state is the random
            number generator. If None, the random number generator is the
            RandomState instance used by np.random.

        Returns
        -------
        (train, vali, test) datasets : tuple of rankeval.dataset.Dataset
            The resulting datasets with the given fraction of query ids in
            each partition. The vali dataset is omitted when vali_size is 0.

        Raises
        ------
        ValueError
            If a ratio is negative or the two ratios sum to more than 1.
        """
        if train_size < 0 or vali_size < 0 or (train_size + vali_size) > 1:
            raise ValueError("train and/or validation sizes are not correct!")

        train_qn = int(round(train_size * self.n_queries))
        # rounding the two ratios independently may overshoot the number of
        # available queries: the validation set gives up the excess
        vali_qn = min(int(round(vali_size * self.n_queries)),
                      self.n_queries - train_qn)
        test_start = train_qn + vali_qn

        qid_map = self._expand_query_ids()

        # add queries shuffling
        rng = Dataset._check_random_state(random_state)
        qids_permutation = rng.permutation(self.query_ids)

        train_qid = qids_permutation[:train_qn]
        vali_qid = qids_permutation[train_qn:test_start]
        # slice from the front: a negative index from the back would select
        # the whole permutation when no query is left for the test set
        test_qid = qids_permutation[test_start:]

        train_mask = np.isin(qid_map, train_qid)
        test_mask = np.isin(qid_map, test_qid)

        train_dataset = Dataset(self.X[train_mask], self.y[train_mask],
                                qid_map[train_mask],
                                name=self.name + ' Train')
        test_dataset = Dataset(self.X[test_mask], self.y[test_mask],
                               qid_map[test_mask],
                               name=self.name + ' Test')

        if not vali_size:
            return train_dataset, test_dataset

        vali_mask = np.isin(qid_map, vali_qid)
        vali_dataset = Dataset(self.X[vali_mask], self.y[vali_mask],
                               qid_map[vali_mask],
                               name=self.name + ' Vali')
        return train_dataset, vali_dataset, test_dataset

    def subset(self, query_ids, name=None):
        """
        This method returns a subset of the dataset according to the
        query_ids parameter.

        Parameters
        ----------
        query_ids : numpy 1d array of int
            It is a ndarray with the query_ids to select
        name : str
            The name to give to the dataset

        Returns
        -------
        datasets : rankeval.dataset.Dataset
            The resulting dataset with only the query_ids requested
        """
        qid_map = self._expand_query_ids()
        mask = np.isin(qid_map, query_ids)
        return Dataset(self.X[mask], self.y[mask], qid_map[mask], name=name)

    def clear_X(self):
        """
        This method clears the space used by the dataset instance for storing
        X (the dataset features). This space is used only for scoring, thus
        it can be freed after.
        """
        # rebinding the attribute drops the reference held by this instance
        self.X = None

    def query_iterator(self):
        """
        This method implements an iterator over the queries of the dataset.

        Returns
        -------
        offsets : tuple of (int, int, int)
            The query id followed by the row indices of the instances
            belonging to that query, as (qid, start, end) with end excluded.
        """
        for i in range(self.n_queries):
            yield (self.query_ids[i],
                   self.query_offsets[i], self.query_offsets[i + 1])

    def get_query_sizes(self):
        """
        This method returns the size of each query set.

        Returns
        -------
        sizes : numpy 1d array of int
            It is a ndarray of shape (n_queries,)
        """
        return np.ediff1d(self.query_offsets)

    def get_qids_dataset(self):
        """
        This method returns the query ids array in linear representation,
        i.e., with the qid of each instance. Useful for creating a new
        dataset starting from a different one.

        Returns
        -------
        query_ids : numpy 1d array of int
            It is a ndarray of shape (n_instances,) with dtype int32
        """
        return self._expand_query_ids().astype(np.int32)

    def _expand_query_ids(self):
        """
        Unroll the compact (query_ids, query_offsets) representation into
        one query id per instance, keeping the dtype of query_ids.

        Returns
        -------
        query_ids : numpy 1d array of int
            It is a ndarray with one entry per instance
        """
        return np.repeat(self.query_ids, np.diff(self.query_offsets))

    @staticmethod
    def _check_random_state(seed):
        """
        Turn seed into a np.random.RandomState instance (taken from sklearn)

        Parameters
        ----------
        seed : None | int | instance of RandomState
            If seed is None, return the RandomState singleton used by
            np.random. If seed is an int, return a new RandomState instance
            seeded with it. If seed is already a RandomState instance,
            return it. Otherwise raise ValueError.

        Returns
        -------
        rng : numpy.random.RandomState
            The random number generator to use

        Raises
        ------
        ValueError
            If seed cannot be turned into a RandomState instance.
        """
        if seed is None or seed is np.random:
            return np.random.mtrand._rand
        if isinstance(seed, (numbers.Integral, np.integer)):
            return np.random.RandomState(seed)
        if isinstance(seed, np.random.RandomState):
            return seed
        raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                         ' instance' % seed)

    def __str__(self):
        return self.name

    def __hash__(self):
        # cheap content-based hash: only the first 100 labels and the first
        # 100 values of the first feature are considered
        total = sum(self.y[:100])
        if self.X is not None:
            # X is None after clear_X(): hash on the labels only
            total += sum(self.X[:100, 0])
        return int(total)

    def __eq__(self, other):
        if not isinstance(other, Dataset):
            return NotImplemented
        if (self.X is None) != (other.X is None):
            return False
        # np.array_equal copes with arrays of different shapes, for which
        # an element-wise == cannot be reduced with .all()
        same_X = self.X is None or np.array_equal(self.X, other.X)
        return bool(same_X and
                    np.array_equal(self.y, other.y) and
                    np.array_equal(self.query_ids, other.query_ids))

    def __ne__(self, other):
        # Not strictly necessary, but to avoid having both x==y and x!=y
        # True at the same time
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result