# -*- coding: utf-8 -*-
import logging
import numpy as np
import pandas as pd
import utool as ut
from wbia.algo.verif import pairfeat
from wbia.algo.verif import sklearn_utils
import vtool as vt

# import itertools as it
# from os.path import join

print, rrr, profile = ut.inject2(__name__)
logger = logging.getLogger('wbia')


@ut.reloadable_class
class BaseVerifier(ut.NiceRepr):
    def __nice__(verif):
        return '.'.join([verif.metadata['task_key'], verif.metadata['clf_key']])

    def predict_proba_df(verif, edges):
        raise NotImplementedError('abstract')

    def fit(verif, edges):
        """
        The vsone.OneVsOneProblem currently handles fitting a model based
        on edges. The actual fit call is in clf_helpers.py.
        """
        raise NotImplementedError('Need to use OneVsOneProblem to do this')
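
    # A hedged sketch of the intended fit workflow (method names are an
    # assumption based on the OneVsOneProblem docstring above, not a
    # verified API):
    #   >>> from wbia.algo.verif import vsone
    #   >>> pblm = vsone.OneVsOneProblem.from_empty('PZ_MTEST')
    #   >>> pblm.load_samples()
    #   >>> pblm.load_features()
    #   >>> pblm.learn_evaluation_classifiers()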

    def predict(verif, edges, method='argmax', encoded=False):
        """
        Predicts a class for each edge. By default the class with the
        highest probability is chosen; if ``encoded`` is True the integer
        class index is returned instead of the class name.
        """
        probs = verif.predict_proba_df(edges)
        target_names = verif.class_names
        pred_enc = sklearn_utils.predict_from_probs(
            probs, method=method, target_names=target_names
        )
        if encoded:
            pred = pred_enc
        else:
            pred = pred_enc.apply(verif.class_names.__getitem__)
        return pred
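
    # Hedged usage sketch (toy edges; assumes a constructed verifier):
    #   >>> edges = [(1, 2), (3, 4)]
    #   >>> pred = verif.predict(edges)  # pd.Series of class names
    #   >>> pred_enc = verif.predict(edges, encoded=True)  # integer codes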

    def easiness(verif, edges, real):
        """
        Gets the probability of the class each edge is labeled as, which
        indicates how easy the example is to classify correctly.
        """
        probs = verif.predict_proba_df(edges)
        target_names = probs.columns.tolist()
        real_enc = np.array([target_names.index(r) for r in real])
        easiness = np.array(ut.ziptake(probs.values, real_enc))
        # easiness = pd.Series(easiness, index=probs.index)
        return easiness


@ut.reloadable_class
class Verifier(BaseVerifier):
    """
    Notes:
        deploy_info should be a dict with the following keys:
            clf: sklearn classifier
            metadata: another dict with keys:
                class_names - classes that clf predicts
                task_key - str
                clf_key - str
                data_info - tuple of (feat_extract_config, feat_dims)
                    # TODO: make feat_dims part of feat_extract_config,
                    # defaulted to None

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.algo.verif.vsone import *  # NOQA
        >>> import wbia
        >>> ibs = wbia.opendb('PZ_MTEST')
        >>> species = 'zebra_plains'
        >>> task_key = 'match_state'
        >>> verif = Deployer()._load_published(ibs, species, task_key)
    """

    def __init__(verif, ibs=None, deploy_info=None):
        verif.ibs = ibs
        verif.clf = None
        verif.metadata = None
        verif.class_names = None
        verif.extr = None
        if deploy_info:
            # Unpack the deployed classifier and its metadata
            verif.clf = deploy_info['clf']
            verif.metadata = deploy_info['metadata']
            verif.class_names = verif.metadata['class_names']
            data_info = verif.metadata['data_info']
            feat_extract_config, feat_dims = data_info
            # Rebuild the feature extractor the classifier was trained with
            feat_extract_config = feat_extract_config.copy()
            feat_extract_config['feat_dims'] = feat_dims
            verif.extr = pairfeat.PairwiseFeatureExtractor(
                ibs, config=feat_extract_config
            )

    def predict_proba_df(verif, edges):
        # TODO: if multiple verifiers share the same feature extractor we
        # should be able to cache the features before running the
        # verification algo. (we used to do this)
        X_df = verif.extr.transform(edges)
        probs_df = sklearn_utils.predict_proba_df(verif.clf, X_df, verif.class_names)
        return probs_df
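
    # Hedged usage sketch (assumes a deployed verifier; toy edges):
    #   >>> probs = verif.predict_proba_df([(1, 2), (3, 4)])
    #   >>> # probs is a DataFrame indexed by edge with one column per
    #   >>> # class name; each row sums to ~1.0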

    # The old cached prediction logic looked roughly like this:
    # prev_data_info = None
    # task_keys = list(infr.verifiers.keys())
    # task_probs = {}
    # for task_key in task_keys:
    #     deploy_info = infr.verifiers[task_key]
    #     data_info = deploy_info['metadata']['data_info']
    #     class_names = deploy_info['metadata']['class_names']
    #     clf = deploy_info['clf']
    #     if prev_data_info != data_info:
    #         X_df = infr._cached_pairwise_features(edges, data_info)
    #         prev_data_info = data_info
    #     probs_df = sklearn_utils.predict_proba_df(clf, X_df, class_names)
    #     task_probs[task_key] = probs_df


@ut.reloadable_class
class IntraVerifier(BaseVerifier):
    """
    Predicts cross-validated intra-training sample probs.

    Note:
        Requires the original OneVsOneProblem object.
        This classifier is for intra-dataset evaluation and is not meant
        to be published for use on external datasets.
    """

    def __init__(verif, pblm, task_key, clf_key, data_key):
        verif.pblm = pblm
        verif.task_key = task_key
        verif.clf_key = clf_key
        verif.data_key = data_key
        verif.metadata = {
            'task_key': task_key,
            'clf_key': clf_key,
        }
        # Make an ensemble of the evaluation classifiers
        from wbia.algo.verif import deploy

        deployer = deploy.Deployer(pblm=verif.pblm)
        verif.ensemble = deployer._make_ensemble_verifier(
            verif.task_key, verif.clf_key, verif.data_key
        )
        verif.class_names = verif.ensemble.class_names
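
    # Hedged construction sketch (assumes an evaluated OneVsOneProblem;
    # the key values are illustrative, not verified defaults):
    #   >>> verif = IntraVerifier(pblm, task_key='match_state',
    #   ...                       clf_key='RF', data_key='learn(sum,glob)')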

    def predict_proba_df(verif, want_edges):
        """
        Predicts task probabilities in one of two ways:
            (1) if an edge was in the training set, its cross-validated
                probability is returned.
            (2) if an edge was not in the training set, the average
                prediction over all cross-validated classifiers is used.
        """
        clf_key = verif.clf_key
        task_key = verif.task_key
        data_key = verif.data_key
        pblm = verif.pblm
        # Load pre-predicted probabilities for intra-training set edges
        res = pblm.task_combo_res[task_key][clf_key][data_key]
        # Normalize and align combined result sample edges
        train_uv = np.array(res.probs_df.index.tolist())
        assert np.all(
            train_uv.T[0] < train_uv.T[1]
        ), 'edges must be in lower triangular form'
        assert len(vt.unique_row_indexes(train_uv)) == len(
            train_uv
        ), 'edges must be unique'
        assert sorted(ut.emap(tuple, train_uv.tolist())) == sorted(
            ut.emap(tuple, pblm.samples.aid_pairs.tolist())
        )
        want_uv = np.array(want_edges)
        # Determine which edges need/have probabilities
        want_uv_, train_uv_ = vt.structure_rows(want_uv, train_uv)
        unordered_have_uv_ = np.intersect1d(want_uv_, train_uv_)
        need_uv_ = np.setdiff1d(want_uv_, unordered_have_uv_)
        flags = vt.flag_intersection(train_uv_, unordered_have_uv_)
        # Re-order have_edges to agree with test_idx
        have_uv_ = train_uv_[flags]
        need_uv, have_uv = vt.unstructure_rows(need_uv_, have_uv_)
        # Convert to tuples for pandas lookup. bleh...
        have_edges = ut.emap(tuple, have_uv.tolist())
        need_edges = ut.emap(tuple, need_uv.tolist())
        want_edges = ut.emap(tuple, want_uv.tolist())
        assert set(have_edges) & set(need_edges) == set([])
        assert set(have_edges) | set(need_edges) == set(want_edges)
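        # Hedged toy illustration: if train_uv contains (1, 2) and
        # want_edges = [(1, 2), (5, 6)], then have_edges == [(1, 2)]
        # (probs reused from res.probs_df) and need_edges == [(5, 6)]
        # (predicted fresh by the ensemble below).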
        # Predict on unseen edges using an ensemble of evaluation classifiers
        logger.info('Predicting %s probabilities' % (task_key,))
        eclf_probs = verif.ensemble.predict_proba_df(need_edges)
        # Combine probabilities --- get probabilities for each sample edge
        # edges = have_edges + need_edges
        have_probs = res.probs_df.loc[have_edges]
        assert (
            have_probs.index.intersection(eclf_probs.index).size == 0
        ), 'training (have) data was not disjoint from new (want) data'
        probs = pd.concat([have_probs, eclf_probs])
        return probs