Source code for wbia.algo.graph.mixin_matching

# -*- coding: utf-8 -*-
import logging
import numpy as np
import utool as ut
import pandas as pd
import itertools as it
import networkx as nx
import vtool as vt
from os.path import join  # NOQA
from wbia.algo.graph import nx_utils as nxu
from wbia.algo.graph.nx_utils import e_
from wbia.algo.graph.state import POSTV, NEGTV, INCMP, UNREV  # NOQA
from concurrent import futures
import tqdm


print, rrr, profile = ut.inject2(__name__)
logger = logging.getLogger('wbia')


def _cm_breaking_worker(cm_list, review_cfg={}, scoring='annot'):
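    """Convert ranked ChipMatch results into candidate edges by keeping the
    top `ranks_top` database annots for each query annotation."""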

    ranks_top = review_cfg.get('ranks_top', None)
    ranks_bot = review_cfg.get('ranks_bot', None)

    # Construct K-broken graph
    edges = []

    if ranks_bot is None:
        ranks_bot = 0

    scoring = scoring.lower()
    assert scoring in ['annot', 'name']
    assert ranks_bot == 0

    for count, cm in enumerate(cm_list):
        score_list = cm.annot_score_list

        # rank_list = ut.argsort(score_list)[::-1]
        # sortx = ut.argsort(rank_list)

        # top_sortx = sortx[:ranks_top]
        # bot_sortx = sortx[len(sortx) - ranks_bot :]
        # short_sortx = ut.unique(top_sortx + bot_sortx)
        # daid_list = ut.take(cm.daid_list, short_sortx)

        values = sorted(zip(score_list, cm.daid_list))[::-1]
        keep = values[:ranks_top]
        daid_list = ut.take_column(keep, 1)
        for daid in daid_list:
            u, v = (cm.qaid, daid)
            if v < u:
                u, v = v, u
            edges.append((u, v))
    return edges
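

# Illustrative sketch (not part of the original module): the top-k "breaking"
# step above keeps only the highest-scoring database annots per query and
# emits undirected (u, v) edges with u < v. This toy function mirrors that
# logic on synthetic scores without needing a ChipMatch object.
def _example_top_k_breaking(ranks_top=2):
    qaid = 1
    daid_list = [2, 3, 4, 5]
    score_list = [0.1, 0.9, 0.5, 0.7]
    values = sorted(zip(score_list, daid_list))[::-1]
    keep = values[:ranks_top]
    edges = []
    for _, daid in keep:
        u, v = (qaid, daid) if qaid < daid else (daid, qaid)
        edges.append((u, v))
    return edges  # -> [(1, 3), (1, 5)]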


def _make_rankings_worker(args):
    import wbia  # NOQA
    import tqdm  # NOQA

    (
        dbdir,
        qaids_chunk,
        daids,
        cfgdict,
        custom_nid_lookup,
        verbose,
        use_cache,
        invalidate_supercache,
        ranks_top,
    ) = args

    ibs = wbia.opendb(dbdir=dbdir)

    edges = set()
    for qaids in tqdm.tqdm(qaids_chunk):

        qreq_ = ibs.new_query_request(
            [qaids],
            daids,
            cfgdict=cfgdict,
            custom_nid_lookup=custom_nid_lookup,
            verbose=verbose,
        )

        cm_list = qreq_.execute(
            prog_hook=None,
            use_cache=use_cache,
            invalidate_supercache=False,
        )

        new_edges = set(_cm_breaking_worker(cm_list, review_cfg={'ranks_top': ranks_top}))
        edges = edges | new_edges

    return edges
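

# Illustrative sketch (not part of the original module): _make_rankings (below)
# optionally chunks the query aids, submits one _make_rankings_worker call per
# chunk to a ThreadPoolExecutor, and unions the edge sets returned by each
# worker. This minimal stand-in shows the same fan-out/fan-in pattern with a
# dummy worker instead of a real wbia database query.
def _example_chunked_ranking(qaids=tuple(range(20)), batch_size=5, nprocs=4):
    def dummy_worker(chunk):
        # stand-in for _make_rankings_worker: return one edge per query aid
        return {(qaid, qaid + 1) for qaid in chunk}

    qaids = list(qaids)
    chunks = [qaids[i:i + batch_size] for i in range(0, len(qaids), batch_size)]
    edges = set()
    with futures.ThreadPoolExecutor(nprocs) as executor:
        fs_chunk = [executor.submit(dummy_worker, chunk) for chunk in chunks]
        for fs in fs_chunk:
            edges |= fs.result()
    return edges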


@ut.reloadable_class
class AnnotInfrMatching(object):
    """
    Methods for running matching algorithms
    """

    @profile
    def exec_matching(
        infr,
        qaids=None,
        daids=None,
        prog_hook=None,
        cfgdict=None,
        name_method='node',
        use_cache=True,
        invalidate_supercache=False,
        batch_size=None,
        ranks_top=5,
    ):
        """
        Loads chip matches into the inference structure
        Uses graph name labeling and ignores wbia labeling
        """
        return infr._make_rankings(
            qaids,
            daids,
            prog_hook,
            cfgdict,
            name_method,
            use_cache=use_cache,
            invalidate_supercache=invalidate_supercache,
            batch_size=batch_size,
            ranks_top=ranks_top,
        )

    def _set_vsmany_info(infr, qreq_, cm_list):
        infr.vsmany_qreq_ = qreq_
        infr.vsmany_cm_list = cm_list
        infr.cm_list = cm_list
        infr.qreq_ = qreq_

    def _make_rankings(
        infr,
        qaids=None,
        daids=None,
        prog_hook=None,
        cfgdict=None,
        name_method='node',
        use_cache=None,
        invalidate_supercache=None,
        batch_size=None,
        ranks_top=5,
    ):
        # from wbia.algo.graph import graph_iden
        # TODO: expose other ranking algos like SMK
        rank_algo = 'LNBNN'
        infr.print('Exec {} ranking algorithm'.format(rank_algo), 1)
        ibs = infr.ibs
        if qaids is None:
            qaids = infr.aids
        qaids = ut.ensure_iterable(qaids)
        if daids is None:
            daids = infr.aids
        if cfgdict is None:
            cfgdict = {
                # 'can_match_samename': False,
                'can_match_samename': True,
                'can_match_sameimg': True,
                # 'augment_queryside_hack': True,
                'K': 3,
                'Knorm': 3,
                'prescore_method': 'csum',
                'score_method': 'csum',
            }
        cfgdict.update(infr.ranker_params)
        infr.print('Using LNBNN config = %r' % (cfgdict,))

        # hack for using current nids
        if name_method == 'node':
            aids = sorted(set(ut.aslist(qaids) + ut.aslist(daids)))
            custom_nid_lookup = infr.get_node_attrs('name_label', aids)
        elif name_method == 'edge':
            custom_nid_lookup = {
                aid: nid for nid, cc in infr.pos_graph._ccs.items() for aid in cc
            }
        elif name_method == 'wbia':
            custom_nid_lookup = None
        else:
            raise KeyError('Unknown name_method={}'.format(name_method))

        verbose = infr.verbose >= 2

        # <HACK FOR PIE V2>
        if cfgdict.get('pipeline_root', None) in ['PieTwo']:
            from wbia_pie_v2._plugin import distance_to_score

            globals().update(locals())

            edges = []
            for qaid in tqdm.tqdm(qaids):
                daids_ = list(set(daids) - set([qaid]))
                pie_annot_distances = ibs.pie_v2_predict_light_distance(
                    qaid,
                    daids_,
                )
                score_list = [
                    distance_to_score(pie_annot_distance, norm=500.0)
                    for pie_annot_distance in pie_annot_distances
                ]
                values = sorted(zip(score_list, daids_))[::-1]
                keep = values[:ranks_top]
                daid_list = ut.take_column(keep, 1)
                for daid in daid_list:
                    u, v = (qaid, daid)
                    if v < u:
                        u, v = v, u
                    edges.append((u, v))
            edges = set(edges)
            return edges
        # </HACK>

        if batch_size is not None:
            qaids_chunks = list(ut.ichunks(qaids, batch_size))
            num_chunks = len(qaids_chunks)
            arg_iter = list(
                zip(
                    [ibs.dbdir] * num_chunks,
                    qaids_chunks,
                    [daids] * num_chunks,
                    [cfgdict] * num_chunks,
                    [custom_nid_lookup] * num_chunks,
                    [verbose] * num_chunks,
                    [use_cache] * num_chunks,
                    [invalidate_supercache] * num_chunks,
                    [ranks_top] * num_chunks,
                )
            )

            nprocs = 8
            logger.info('Creating %d processes' % (nprocs,))
            executor = futures.ThreadPoolExecutor(nprocs)

            logger.info('Submitting workers')
            fs_chunk = []
            for args in ut.ProgIter(arg_iter, lbl='submit matching threads'):
                fs = executor.submit(_make_rankings_worker, args)
                fs_chunk.append(fs)

            results = []
            try:
                for fs in ut.ProgIter(fs_chunk, lbl='getting matching result'):
                    result = fs.result()
                    results.append(result)
            except Exception:
                raise
            finally:
                executor.shutdown(wait=True)

            assert len(results) == num_chunks
            edges = set(ut.flatten(results))
        else:
            qreq_ = ibs.new_query_request(
                qaids,
                daids,
                cfgdict=cfgdict,
                custom_nid_lookup=custom_nid_lookup,
                verbose=infr.verbose >= 2,
            )
            # cacher = qreq_.get_big_cacher()
            # if not cacher.exists():
            #     pass
            #     # import sys
            #     # sys.exit(1)
            cm_list = qreq_.execute(
                prog_hook=prog_hook,
                use_cache=use_cache,
                invalidate_supercache=invalidate_supercache,
            )
            infr._set_vsmany_info(qreq_, cm_list)
            edges = set(_cm_breaking_worker(cm_list, review_cfg={'ranks_top': ranks_top}))

        return edges
        # return cm_list

    def _make_matches_from(infr, edges, config=None, prog_hook=None):
        from wbia.algo.verif import pairfeat

        if config is None:
            config = infr.verifier_params
        extr = pairfeat.PairwiseFeatureExtractor(infr.ibs, config=config)
        match_list = extr._exec_pairwise_match(edges, prog_hook=prog_hook)
        return match_list

    def exec_vsone_subset(infr, edges, prog_hook=None):
        r"""
        Args:
            prog_hook (None): (default = None)

        CommandLine:
            python -m wbia.algo.graph.core exec_vsone_subset

        Example:
            >>> # ENABLE_DOCTEST
            >>> from wbia.algo.graph.core import *  # NOQA
            >>> infr = testdata_infr('testdb1')
            >>> infr.ensure_full()
            >>> edges = [(1, 2), (2, 3)]
            >>> result = infr.exec_vsone_subset(edges)
            >>> print(result)
        """
        match_list = infr._make_matches_from(edges, prog_hook=prog_hook)
        # TODO: is this code necessary anymore?
        vsone_matches = {e_(u, v): match for (u, v), match in zip(edges, match_list)}
        infr.vsone_matches.update(vsone_matches)
        edge_to_score = {e: match.fs.sum() for e, match in vsone_matches.items()}
        infr.graph.add_edges_from(edge_to_score.keys())
        infr.set_edge_attrs('score', edge_to_score)
        return match_list

    def lookup_cm(infr, aid1, aid2):
        """
        Get chipmatch object associated with an edge if one exists.
        """
        if infr.cm_list is None:
            return None, aid1, aid2
        # TODO: keep chip matches in dictionary by default?
        aid2_idx = ut.make_index_lookup([cm.qaid for cm in infr.cm_list])

        switch_order = False

        if aid1 in aid2_idx:
            idx = aid2_idx[aid1]
            cm = infr.cm_list[idx]
            if aid2 not in cm.daid2_idx:
                switch_order = True
                # raise KeyError('switch order')
        else:
            switch_order = True

        if switch_order:
            # switch order
            aid1, aid2 = aid2, aid1
            idx = aid2_idx[aid1]
            cm = infr.cm_list[idx]
            if aid2 not in cm.daid2_idx:
                raise KeyError('No ChipMatch for edge (%r, %r)' % (aid1, aid2))
        return cm, aid1, aid2

    @profile
    def apply_match_edges(infr, review_cfg={}):
        """
        Adds results from one-vs-many rankings as edges in the graph
        """
        if infr.cm_list is None:
            infr.print('apply_match_edges - matching has not been run!')
            return
        infr.print('apply_match_edges', 1)

        edges = infr._cm_breaking(review_cfg)
        # Create match-based graph structure
        infr.print('apply_match_edges adding %d edges' % len(edges), 1)
        infr.graph.add_edges_from(edges)
        infr.apply_match_scores()

    def _cm_breaking(infr, cm_list=None, review_cfg={}, scoring='annot'):
        """
        >>> from wbia.algo.graph.core import *  # NOQA
        >>> review_cfg = {}
        """
        if cm_list is None:
            cm_list = infr.cm_list
        return _cm_breaking_worker(
            cm_list=cm_list, review_cfg=review_cfg, scoring=scoring
        )

    def _cm_training_pairs(
        infr,
        qreq_=None,
        cm_list=None,
        top_gt=2,
        mid_gt=2,
        bot_gt=2,
        top_gf=2,
        mid_gf=2,
        bot_gf=2,
        rand_gt=2,
        rand_gf=2,
        rng=None,
    ):
        """
        Constructs training data for a pairwise classifier

        CommandLine:
            python -m wbia.algo.graph.core _cm_training_pairs

        Example:
            >>> # xdoctest: +REQUIRES(--slow)
            >>> # ENABLE_DOCTEST
            >>> from wbia.algo.graph.core import *  # NOQA
            >>> infr = testdata_infr('PZ_MTEST')
            >>> infr.exec_matching(cfgdict={
            >>>     'can_match_samename': True,
            >>>     'K': 4,
            >>>     'Knorm': 1,
            >>>     'prescore_method': 'csum',
            >>>     'score_method': 'csum'
            >>> })
            >>> from wbia.algo.graph.core import *  # NOQA
            >>> exec(ut.execstr_funckw(infr._cm_training_pairs))
            >>> rng = np.random.RandomState(42)
            >>> aid_pairs = np.array(infr._cm_training_pairs(rng=rng))
            >>> print(len(aid_pairs))
            >>> assert np.sum(aid_pairs.T[0] == aid_pairs.T[1]) == 0
        """
        if qreq_ is None:
            cm_list = infr.cm_list
            qreq_ = infr.qreq_
        ibs = infr.ibs
        aid_pairs = []
        dnids = qreq_.get_qreq_annot_nids(qreq_.daids)
        # dnids = qreq_.get_qreq_annot_nids(qreq_.daids)
        rng = ut.ensure_rng(rng)
        for cm in ut.ProgIter(cm_list, lbl='building pairs'):
            all_gt_aids = cm.get_top_gt_aids(ibs)
            all_gf_aids = cm.get_top_gf_aids(ibs)
            gt_aids = ut.take_percentile_parts(all_gt_aids, top_gt, mid_gt, bot_gt)
            gf_aids = ut.take_percentile_parts(all_gf_aids, top_gf, mid_gf, bot_gf)
            # get unscored examples
            unscored_gt_aids = [
                aid for aid in qreq_.daids[cm.qnid == dnids] if aid not in cm.daid2_idx
            ]
            rand_gt_aids = ut.random_sample(unscored_gt_aids, rand_gt, rng=rng)
            # gf_aids = cm.get_groundfalse_daids()
            _gf_aids = qreq_.daids[cm.qnid != dnids]
            _gf_aids = qreq_.daids.compress(cm.qnid != dnids)
            # gf_aids = ibs.get_annot_groundfalse(cm.qaid, daid_list=qreq_.daids)
            rand_gf_aids = ut.random_sample(_gf_aids, rand_gf, rng=rng).tolist()
            chosen_daids = ut.unique(gt_aids + gf_aids + rand_gf_aids + rand_gt_aids)
            aid_pairs.extend([(cm.qaid, aid) for aid in chosen_daids if cm.qaid != aid])
        return aid_pairs

    def _get_cm_agg_aid_ranking(infr, cc):
        aid_to_cm = {cm.qaid: cm for cm in infr.cm_list}
        all_scores = ut.ddict(list)
        for qaid in cc:
            cm = aid_to_cm[qaid]
            # should we be doing nids?
            for daid, score in zip(cm.get_top_aids(), cm.get_top_scores()):
                all_scores[daid].append(score)

        max_scores = sorted((max(scores), aid) for aid, scores in all_scores.items())[
            ::-1
        ]
        ranked_aids = ut.take_column(max_scores, 1)
        return ranked_aids

    def _get_cm_edge_data(infr, edges, cm_list=None):
        symmetric = True

        if cm_list is None:
            cm_list = infr.cm_list
        # Find scores for the edges that exist in the graph
        edge_to_data = ut.ddict(dict)
        aid_to_cm = {cm.qaid: cm for cm in cm_list}
        for u, v in edges:
            if symmetric:
                u, v = e_(u, v)
            cm1 = aid_to_cm.get(u, None)
            cm2 = aid_to_cm.get(v, None)
            scores = []
            ranks = []
            for cm in ut.filter_Nones([cm1, cm2]):
                for aid in [u, v]:
                    idx = cm.daid2_idx.get(aid, None)
                    if idx is None:
                        continue
                    score = cm.annot_score_list[idx]
                    rank = cm.get_annot_ranks([aid])[0]
                    scores.append(score)
                    ranks.append(rank)
            if len(scores) == 0:
                score = None
                rank = None
            else:
                # Choose whichever one gave the best score
                idx = vt.safe_argmax(scores, nans=False)
                score = scores[idx]
                rank = ranks[idx]
            edge_to_data[(u, v)]['score'] = score
            edge_to_data[(u, v)]['rank'] = rank
        return edge_to_data

    @profile
    def apply_match_scores(infr):
        """
        Applies precomputed matching scores to edges that already exist in the
        graph. Typically you should run infr.apply_match_edges() before running
        this.

        CommandLine:
            python -m wbia.algo.graph.core apply_match_scores --show

        Example:
            >>> # xdoctest: +REQUIRES(--slow)
            >>> # ENABLE_DOCTEST
            >>> from wbia.algo.graph.core import *  # NOQA
            >>> infr = testdata_infr('PZ_MTEST')
            >>> infr.exec_matching()
            >>> infr.apply_match_edges()
            >>> infr.apply_match_scores()
            >>> infr.get_edge_attrs('score')
        """
        if infr.cm_list is None:
            infr.print('apply_match_scores - no scores to apply!')
            return
        infr.print('apply_match_scores', 1)
        edges = list(infr.graph.edges())
        edge_to_data = infr._get_cm_edge_data(edges)

        # Remove existing attrs
        ut.nx_delete_edge_attr(infr.graph, 'score')
        ut.nx_delete_edge_attr(infr.graph, 'rank')
        ut.nx_delete_edge_attr(infr.graph, 'normscore')

        edges = list(edge_to_data.keys())
        edge_scores = list(ut.take_column(edge_to_data.values(), 'score'))
        edge_scores = ut.replace_nones(edge_scores, np.nan)
        edge_scores = np.array(edge_scores)
        edge_ranks = np.array(ut.take_column(edge_to_data.values(), 'rank'))
        # take the inf-norm
        normscores = edge_scores / vt.safe_max(edge_scores, nans=False)

        # Add new attrs
        infr.set_edge_attrs('score', ut.dzip(edges, edge_scores))
        infr.set_edge_attrs('rank', ut.dzip(edges, edge_ranks))

        # Hack away zero probabilities
        # probs = np.vstack([p_nomatch, p_match, p_notcomp]).T + 1e-9
        # probs = vt.normalize(probs, axis=1, ord=1, out=probs)
        # entropy = -(np.log2(probs) * probs).sum(axis=1)
        infr.set_edge_attrs('normscore', dict(zip(edges, normscores)))

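
# Illustrative sketch (not part of the original module): apply_match_scores
# (above) and _make_lnbnn_scores (below) normalize raw LNBNN edge scores by
# the maximum finite score (the "inf-norm"), mapping them into [0, 1] while
# leaving NaNs for edges with no ChipMatch data. A minimal numpy equivalent:
def _example_normalize_scores(edge_scores=(5.0, 2.5, float('nan'), 10.0)):
    edge_scores = np.array(edge_scores, dtype=float)
    max_score = np.nanmax(edge_scores)  # analogue of vt.safe_max(..., nans=False)
    normscores = edge_scores / max_score
    return normscores  # -> [0.5, 0.25, nan, 1.0]
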
class InfrLearning(object):
    def learn_deploy_verifiers(infr, publish=False):
        """
        Uses current knowledge to train verifiers for new unseen pairs.

        Example:
            >>> # DISABLE_DOCTEST
            >>> import wbia
            >>> ibs = wbia.opendb('PZ_MTEST')
            >>> infr = wbia.AnnotInference(ibs, aids='all')
            >>> infr.ensure_mst()
            >>> publish = False
            >>> infr.learn_deploy_verifiers()

        Ignore:
            publish = True
        """
        infr.print('learn_deploy_verifiers')
        from wbia.algo.verif import vsone

        pblm = vsone.OneVsOneProblem(infr, verbose=True)
        pblm.primary_task_key = 'match_state'
        pblm.default_clf_key = 'RF'
        pblm.default_data_key = 'learn(sum,glob)'
        pblm.setup()
        dpath = '.'

        task_key = 'match_state'
        pblm.deploy(dpath, task_key=task_key, publish=publish)

        task_key = 'photobomb_state'
        if task_key in pblm.eval_task_keys:
            pblm.deploy(dpath, task_key=task_key)

    def learn_evaluation_verifiers(infr):
        """
        Creates a cross-validated ensemble of classifiers to evaluate
        verifier error cases and groundtruth errors.

        CommandLine:
            python -m wbia.algo.graph.mixin_matching learn_evaluation_verifiers

        Doctest:
            >>> # xdoctest: +REQUIRES(module:wbia_cnn, --slow)
            >>> import wbia
            >>> infr = wbia.AnnotInference(
            >>>     'PZ_MTEST', aids='all', autoinit='annotmatch',
            >>>     verbose=4)
            >>> verifiers = infr.learn_evaluation_verifiers()
            >>> edges = list(infr.edges())
            >>> verif = verifiers['match_state']
            >>> probs = verif.predict_proba_df(edges)
            >>> print(probs)
        """
        infr.print('learn_evaluation_verifiers')
        from wbia.algo.verif import vsone

        pblm = vsone.OneVsOneProblem(infr, verbose=5)
        pblm.primary_task_key = 'match_state'
        pblm.eval_clf_keys = ['RF']
        pblm.eval_data_keys = ['learn(sum,glob)']
        pblm.setup_evaluation()
        if True:
            pblm.report_evaluation()
        verifiers = pblm._make_evaluation_verifiers(pblm.eval_task_keys)
        return verifiers

    def load_published(infr):
        """
        Downloads, caches, and loads pre-trained verifiers.
        This is the default action.
        """
        from wbia.algo.verif import deploy

        ibs = infr.ibs
        species = ibs.get_primary_database_species(infr.aids)

        infr.print('Loading task_thresh for species: %r' % (species,))
        assert species in infr.task_thresh_dict
        infr.task_thresh = infr.task_thresh_dict[species]
        infr.print('infr.task_thresh: %r' % (infr.task_thresh,))

        infr.print('Loading verifiers for species: %r' % (species,))
        try:
            infr.verifiers = deploy.Deployer().load_published(ibs, species)
            message = 'Loaded verifiers %r' % (infr.verifiers,)
            infr.print(message)
        except TypeError as ex:
            message = 'Error: Failed to load verifiers for %r' % (species,)
            ut.printex(
                ex,
                message,
                iswarning=True,
                tb=True,
            )
            infr.print(message)

    def load_latest_classifiers(infr, dpath):
        from wbia.algo.verif import deploy

        task_clf_fpaths = deploy.Deployer(dpath).find_latest_local()
        classifiers = {}
        for task_key, fpath in task_clf_fpaths.items():
            clf_info = ut.load_data(fpath)
            assert (
                clf_info['metadata']['task_key'] == task_key
            ), 'bad saved clf at fpath={}'.format(fpath)
            classifiers[task_key] = clf_info
        infr.verifiers = classifiers
        # return classifiers

    def photobomb_samples(infr):
        edges = list(infr.edges())
        tags_list = list(infr.gen_edge_values('tags', edges=edges, default=[]))
        flags = ut.filterflags_general_tags(tags_list, has_any=['photobomb'])
        pb_edges = ut.compress(edges, flags)
        return pb_edges

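
# Illustrative sketch (not part of the original classes): photobomb_samples
# (above) selects edges whose 'tags' attribute mentions 'photobomb'. The
# utool helpers it uses behave roughly like this plain-Python version:
def _example_filter_photobomb_edges():
    edges = [(1, 2), (3, 4), (5, 6)]
    tags_list = [['photobomb'], [], ['photobomb', 'scenerymatch']]
    flags = ['photobomb' in tags for tags in tags_list]
    pb_edges = [edge for edge, flag in zip(edges, flags) if flag]
    return pb_edges  # -> [(1, 2), (5, 6)]
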
class _RedundancyAugmentation(object):

    # def rand_neg_check_edges(infr, c1_nodes, c2_nodes):
    #     """
    #     Find enough edges to between two pccs to make them k-negative complete
    #     """
    #     k = infr.params['redun.neg']
    #     existing_edges = nxu.edges_cross(infr.graph, c1_nodes, c2_nodes)
    #     reviewed_edges = {
    #         edge: state
    #         for edge, state in infr.get_edge_attrs(
    #             'decision', existing_edges,
    #             default=UNREV).items()
    #         if state != UNREV
    #     }
    #     n_neg = sum([state == NEGTV for state in reviewed_edges.values()])
    #     if n_neg < k:
    #         # Find k random negative edges
    #         check_edges = existing_edges - set(reviewed_edges)
    #         if len(check_edges) < k:
    #             edges = it.starmap(nxu.e_, it.product(c1_nodes, c2_nodes))
    #             for edge in edges:
    #                 if edge not in reviewed_edges:
    #                     check_edges.add(edge)
    #                     if len(check_edges) == k:
    #                         break
    #     else:
    #         check_edges = {}
    #     return check_edges

    def find_neg_augment_edges(infr, cc1, cc2, k=None):
        """
        Find enough edges between two PCCs to make them k-negative complete.
        The two CCs should be disjoint and not have any positive edges between
        them.

        Args:
            cc1 (set): nodes in one PCC
            cc2 (set): nodes in another positive-disjoint PCC
            k (int): redundancy level (if None uses infr.params['redun.neg'])

        Example:
            >>> # DISABLE_DOCTEST
            >>> from wbia.algo.graph import demo
            >>> k = 2
            >>> cc1, cc2 = {1}, {2, 3}
            >>> # --- return an augmentation if feasible
            >>> infr = demo.demodata_infr(ccs=[cc1, cc2], ignore_pair=True)
            >>> edges = set(infr.find_neg_augment_edges(cc1, cc2, k=k))
            >>> assert edges == {(1, 2), (1, 3)}
            >>> # --- if infeasible return a partial augmentation
            >>> infr.add_feedback((1, 2), INCMP)
            >>> edges = set(infr.find_neg_augment_edges(cc1, cc2, k=k))
            >>> assert edges == {(1, 3)}
        """
        if k is None:
            k = infr.params['redun.neg']
        assert cc1 is not cc2, 'CCs should be disjoint (but they are the same)'
        assert len(cc1.intersection(cc2)) == 0, 'CCs should be disjoint'

        existing_edges = set(nxu.edges_cross(infr.graph, cc1, cc2))
        reviewed_edges = {
            edge: state
            for edge, state in zip(
                existing_edges, infr.edge_decision_from(existing_edges)
            )
            if state != UNREV
        }

        # Find how many negative edges we already have
        num = sum([state == NEGTV for state in reviewed_edges.values()])
        if num < k:
            # Find k random negative edges
            check_edges = existing_edges - set(reviewed_edges)
            # Check the existing but unreviewed edges first
            for edge in check_edges:
                num += 1
                yield edge
                if num >= k:
                    return
            # Check non-existing edges next
            seed = 2827295125
            try:
                seed += sum(cc1) + sum(cc2)
            except Exception:
                pass
            rng = np.random.RandomState(seed)
            cc1 = ut.shuffle(list(cc1), rng=rng)
            cc2 = ut.shuffle(list(cc2), rng=rng)
            cc1 = ut.shuffle(list(cc1), rng=rng)
            for edge in it.starmap(nxu.e_, nxu.diag_product(cc1, cc2)):
                if edge not in existing_edges:
                    num += 1
                    yield edge
                    if num >= k:
                        return

    def find_pos_augment_edges(infr, pcc, k=None):
        """
        # [[1, 0], [0, 2], [1, 2], [3, 1]]
        pos_sub = nx.Graph([[0, 1], [1, 2], [0, 2], [1, 3]])
        """
        if k is None:
            pos_k = infr.params['redun.pos']
        else:
            pos_k = k
        pos_sub = infr.pos_graph.subgraph(pcc)

        # TODO:
        # weight by pairs most likely to be comparable

        # First try to augment only with unreviewed existing edges
        unrev_avail = list(nxu.edges_inside(infr.unreviewed_graph, pcc))
        try:
            check_edges = list(
                nxu.k_edge_augmentation(
                    pos_sub, k=pos_k, avail=unrev_avail, partial=False
                )
            )
        except nx.NetworkXUnfeasible:
            check_edges = None
        if not check_edges:
            # Allow new edges to be introduced
            full_sub = infr.graph.subgraph(pcc).copy()
            new_avail = ut.estarmap(infr.e_, nx.complement(full_sub).edges())
            full_avail = unrev_avail + new_avail
            n_max = (len(pos_sub) * (len(pos_sub) - 1)) // 2
            n_complement = n_max - pos_sub.number_of_edges()
            if len(full_avail) == n_complement:
                # can use the faster algorithm
                check_edges = list(
                    nxu.k_edge_augmentation(pos_sub, k=pos_k, partial=True)
                )
            else:
                # have to use the slow approximate algo
                check_edges = list(
                    nxu.k_edge_augmentation(
                        pos_sub, k=pos_k, avail=full_avail, partial=True
                    )
                )
        check_edges = set(it.starmap(e_, check_edges))
        return check_edges

    @profile
    def find_pos_redun_candidate_edges(infr, k=None, verbose=False):
        r"""
        Searches for augmenting edges that would make PCCs k-positive redundant

        Doctest:
            >>> from wbia.algo.graph.mixin_matching import *  # NOQA
            >>> from wbia.algo.graph import demo
            >>> infr = demo.demodata_infr(ccs=[(1, 2, 3, 4, 5), (7, 8, 9, 10)])
            >>> infr.add_feedback((2, 5), 'match')
            >>> infr.add_feedback((1, 5), 'notcomp')
            >>> infr.params['redun.pos'] = 2
            >>> candidate_edges = list(infr.find_pos_redun_candidate_edges())
            >>> result = ('candidate_edges = ' + ut.repr2(candidate_edges))
            >>> print(result)
            candidate_edges = []
        """
        # Add random edges between existing non-redundant PCCs
        if k is None:
            k = infr.params['redun.pos']
        # infr.find_non_pos_redundant_pccs(k=k, relax=True)
        pcc_gen = list(infr.positive_components())
        prog = ut.ProgIter(pcc_gen, enabled=verbose, freq=1, adjust=False)
        for pcc in prog:
            if not infr.is_pos_redundant(pcc, k=k, relax=True, assume_connected=True):
                for edge in infr.find_pos_augment_edges(pcc, k=k):
                    yield nxu.e_(*edge)

    @profile
    def find_neg_redun_candidate_edges(infr, k=None):
        """
        Get pairs of PCCs that are not complete. Finds edges that might
        complete them.

        Example:
            >>> # DISABLE_DOCTEST
            >>> from wbia.algo.graph.mixin_matching import *  # NOQA
            >>> from wbia.algo.graph import demo
            >>> infr = demo.demodata_infr(ccs=[(1,), (2,), (3,)], ignore_pair=True)
            >>> edges = list(infr.find_neg_redun_candidate_edges())
            >>> assert len(edges) == 3, 'all should be needed here'
            >>> infr.add_feedback_from(edges, evidence_decision=NEGTV)
            >>> assert len(list(infr.find_neg_redun_candidate_edges())) == 0

        Example:
            >>> # DISABLE_DOCTEST
            >>> from wbia.algo.graph import demo
            >>> infr = demo.demodata_infr(pcc_sizes=[3] * 20, ignore_pair=True)
            >>> ccs = list(infr.positive_components())
            >>> gen = infr.find_neg_redun_candidate_edges(k=2)
            >>> for edge in gen:
            >>>     # What happens when we make ccs positive
            >>>     print(infr.node_labels(edge))
            >>>     infr.add_feedback(edge, evidence_decision=POSTV)
            >>> import ubelt as ub
            >>> infr = demo.demodata_infr(pcc_sizes=[1] * 30, ignore_pair=True)
            >>> ccs = list(infr.positive_components())
            >>> gen = infr.find_neg_redun_candidate_edges(k=3)
            >>> for chunk in ub.chunks(gen, 2):
            >>>     for edge in chunk:
            >>>         # What happens when we make ccs positive
            >>>         print(infr.node_labels(edge))
            >>>         infr.add_feedback(edge, evidence_decision=POSTV)
            >>> list(gen)
        """
        if k is None:
            k = infr.params['redun.neg']
        # Loop through all pairs
        for cc1, cc2 in infr.find_non_neg_redun_pccs(k=k):
            if len(cc1.intersection(cc2)) > 0:
                # If there is modification of the underlying graph while we
                # iterate, then two ccs may not be disjoint. Skip these cases.
                continue
            for u, v in infr.find_neg_augment_edges(cc1, cc2, k):
                edge = e_(u, v)
                infr.assert_edge(edge)
                yield edge

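
# Illustrative sketch (not part of the original classes): find_pos_augment_edges
# (above) asks for a k-edge-augmentation of the positive subgraph of a PCC,
# i.e. a small set of extra edges that would make it k-edge-connected.
# networkx provides this primitive directly; a minimal standalone example:
def _example_k_edge_augmentation(k=2):
    pos_sub = nx.path_graph(4)  # reviewed-positive edges: 0-1, 1-2, 2-3
    # candidate edges that could be added (here: every missing pair)
    avail = list(nx.complement(pos_sub).edges())
    aug_edges = list(
        nx.algorithms.connectivity.k_edge_augmentation(pos_sub, k=k, avail=avail)
    )
    return aug_edges  # for k=2 a single chord such as (0, 3) suffices
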
class CandidateSearch(_RedundancyAugmentation):
    """Search for candidate edges"""

    @profile
    def find_lnbnn_candidate_edges(
        infr,
        desired_states=[UNREV],
        can_match_samename=False,
        can_match_sameimg=False,
        K=5,
        Knorm=5,
        requery=True,
        prescore_method='csum',
        score_method='csum',
        sv_on=True,
        cfgdict_=None,
        batch_size=None,
    ):
        """
        Example:
            >>> # DISABLE_DOCTEST
            >>> # xdoctest: +REQUIRES(--slow)
            >>> from wbia.algo.graph import demo
            >>> infr = demo.demodata_mtest_infr()
            >>> cand_edges = infr.find_lnbnn_candidate_edges()
            >>> assert len(cand_edges) > 200, len(cand_edges)
        """
        # Refresh the name labels
        # TODO: abstract into a Ranker class

        # do LNBNN query for new edges
        # Use one-vs-many to establish candidate edges to classify
        cfgdict = {
            'resize_dim': 'width',
            'dim_size': 700,
            'requery': requery,
            'can_match_samename': can_match_samename,
            'can_match_sameimg': can_match_sameimg,
            'K': K,
            'Knorm': Knorm,
            'sv_on': sv_on,
            'prescore_method': prescore_method,
            'score_method': score_method,
        }
        if cfgdict_ is not None:
            cfgdict.update(cfgdict_)

        print('[find_lnbnn_candidate_edges] Using cfgdict = %s' % (ut.repr3(cfgdict),))

        ranks_top = infr.params['ranking.ntop']
        response = infr.exec_matching(
            name_method='edge',
            cfgdict=cfgdict,
            batch_size=batch_size,
            ranks_top=ranks_top,
        )

        if cfgdict_ is None:
            # infr.apply_match_edges(review_cfg={'ranks_top': 5})
            lnbnn_results = set(infr._cm_breaking(review_cfg={'ranks_top': ranks_top}))
        else:
            assert response is not None
            lnbnn_results = set(response)

        candidate_edges = {
            edge
            for edge, state in zip(lnbnn_results, infr.edge_decision_from(lnbnn_results))
            if state in desired_states
        }

        infr.print(
            'ranking alg found {}/{} {} edges'.format(
                len(candidate_edges), len(lnbnn_results), desired_states
            ),
            1,
        )
        return candidate_edges

    def ensure_task_probs(infr, edges):
        """
        Ensures that probabilities are assigned to the edges.
        This guarantees that infr.task_probs contains data for edges.
        (Currently only the primary task is actually ensured)

        CommandLine:
            python -m wbia.algo.graph.mixin_matching ensure_task_probs

        Doctest:
            >>> # DISABLE_DOCTEST
            >>> from wbia.algo.graph.mixin_matching import *
            >>> import wbia
            >>> infr = wbia.AnnotInference('PZ_MTEST', aids='all',
            >>>                            autoinit='staging')
            >>> edges = list(infr.edges())[0:3]
            >>> infr.load_published()
            >>> assert len(infr.task_probs['match_state']) == 0
            >>> infr.ensure_task_probs(edges)
            >>> assert len(infr.task_probs['match_state']) == 3
            >>> infr.ensure_task_probs(edges)
            >>> assert len(infr.task_probs['match_state']) == 3

        Doctest:
            >>> # DISABLE_DOCTEST
            >>> from wbia.algo.graph.mixin_matching import *
            >>> from wbia.algo.graph import demo
            >>> infr = demo.demodata_infr(num_pccs=6, p_incon=.5, size_std=2)
            >>> edges = list(infr.edges())
            >>> infr.ensure_task_probs(edges)
            >>> assert all([np.isclose(sum(p.values()), 1)
            >>>             for p in infr.task_probs['match_state'].values()])
        """
        if not infr.verifiers:
            raise Exception('Verifiers are needed to predict probabilities')

        # Construct pairwise features on edges in infr
        primary_task = 'match_state'

        match_task = infr.task_probs[primary_task]
        need_flags = [e not in match_task for e in edges]

        if any(need_flags):
            need_edges = ut.compress(edges, need_flags)
            need_edges = list(set(need_edges))
            infr.print(
                'There are {} edges without probabilities'.format(len(need_edges)), 1
            )

            # Only recompute for the needed edges
            task_probs = infr._make_task_probs(need_edges)

            # Store task probs in internal data structure
            # FIXME: this is slow
            for task, probs in task_probs.items():
                probs_dict = probs.to_dict(orient='index')
                if task not in infr.task_probs:
                    infr.task_probs[task] = probs_dict
                else:
                    infr.task_probs[task].update(probs_dict)

                # Set edge task attribute as well
                infr.set_edge_attrs(task, probs_dict)

    @profile
    def ensure_priority_scores(infr, priority_edges):
        """
        Ensures that priority attributes are assigned to the edges.
        This does not change the state of the queue.

        Doctest:
            >>> import wbia
            >>> ibs = wbia.opendb('PZ_MTEST')
            >>> infr = wbia.AnnotInference(ibs, aids='all')
            >>> infr.ensure_mst()
            >>> priority_edges = list(infr.edges())[0:1]
            >>> infr.ensure_priority_scores(priority_edges)

        Doctest:
            >>> import wbia
            >>> ibs = wbia.opendb('PZ_MTEST')
            >>> infr = wbia.AnnotInference(ibs, aids='all')
            >>> infr.ensure_mst()
            >>> # infr.load_published()
            >>> priority_edges = list(infr.edges())
            >>> infr.ensure_priority_scores(priority_edges)

        Doctest:
            >>> from wbia.algo.graph import demo
            >>> infr = demo.demodata_infr(num_pccs=6, p_incon=.5, size_std=2)
            >>> edges = list(infr.edges())
            >>> infr.ensure_priority_scores(edges)
        """
        infr.print('Checking for verifiers: %r' % (infr.verifiers,))

        if infr.verifiers and infr.ibs is not None:
            infr.print(
                'Prioritizing {} edges with one-vs-one probs'.format(len(priority_edges)),
                1,
            )
            infr.print('Using thresholds: %r' % (infr.task_thresh,))
            infr.print(
                'Using infr.params[autoreview.enabled] : %r'
                % (infr.params['autoreview.enabled'],)
            )
            infr.print(
                'Using infr.params[autoreview.prioritize_nonpos]: %r'
                % (infr.params['autoreview.prioritize_nonpos'],)
            )

            infr.ensure_task_probs(priority_edges)

            infr.load_published()

            primary_task = 'match_state'
            match_probs = infr.task_probs[primary_task]
            primary_thresh = infr.task_thresh[primary_task]

            # Read match_probs into a DataFrame
            primary_probs = pd.DataFrame(
                ut.take(match_probs, priority_edges),
                index=nxu.ensure_multi_index(priority_edges, ('aid1', 'aid2')),
            )

            # Convert match-state probabilities into priorities
            prob_match = primary_probs[POSTV]

            # Initialize priorities to probability of matching
            default_priority = prob_match.copy()

            # If the edges are currently between the same individual, then
            # prioritize by non-positive probability (because those edges might
            # expose an inconsistency)
            already_pos = [
                infr.pos_graph.node_label(u) == infr.pos_graph.node_label(v)
                for u, v in priority_edges
            ]
            default_priority[already_pos] = 1 - default_priority[already_pos]

            if infr.params['autoreview.enabled']:
                if infr.params['autoreview.prioritize_nonpos']:
                    # Give positives that pass automatic thresholds high priority
                    _probs = primary_probs[POSTV]
                    flags = _probs > primary_thresh[POSTV]
                    default_priority[flags] = (
                        np.maximum(default_priority[flags], _probs[flags]) + 1
                    )

                    # Give negatives that pass automatic thresholds high priority
                    _probs = primary_probs[NEGTV]
                    flags = _probs > primary_thresh[NEGTV]
                    default_priority[flags] = (
                        np.maximum(default_priority[flags], _probs[flags]) + 1
                    )

                    # Give not-comps that pass automatic thresholds high priority
                    _probs = primary_probs[INCMP]
                    flags = _probs > primary_thresh[INCMP]
                    default_priority[flags] = (
                        np.maximum(default_priority[flags], _probs[flags]) + 1
                    )

            infr.set_edge_attrs('prob_match', prob_match.to_dict())
            infr.set_edge_attrs('default_priority', default_priority.to_dict())

            metric = 'default_priority'
            priority = default_priority
        elif infr.cm_list is not None:
            infr.print(
                'Prioritizing {} edges with one-vs-vsmany scores'.format(
                    len(priority_edges)
                )
            )
            # Not given any deploy classifier, this is the best we can do
            scores = infr._make_lnbnn_scores(priority_edges)
            metric = 'normscore'
            priority = scores
        else:
            infr.print(
                'WARNING: No verifiers to prioritize {} edge(s)'.format(
                    len(priority_edges)
                )
            )
            metric = 'random'
            priority = np.zeros(len(priority_edges)) + 1e-6

        infr.set_edge_attrs(metric, ut.dzip(priority_edges, priority))
        return metric, priority

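    # Illustrative sketch (not part of the original class): the priority rule in
    # ensure_priority_scores, shown on hypothetical data. Priority starts as the
    # positive-match probability; edges whose endpoints already share a PCC are
    # flipped to (1 - p) so potential inconsistencies surface first; edges that
    # clear an automatic-review threshold are boosted above 1. Roughly:
    #
    #     probs = pd.DataFrame({POSTV: [0.9, 0.2], NEGTV: [0.05, 0.7],
    #                           INCMP: [0.05, 0.1]}, index=[(1, 2), (3, 4)])
    #     already_pos = np.array([False, True])
    #     priority = probs[POSTV].copy()
    #     priority[already_pos] = 1 - priority[already_pos]
    #     for state, thresh in {POSTV: 0.8, NEGTV: 0.6, INCMP: 0.9}.items():
    #         flags = probs[state] > thresh
    #         priority[flags] = np.maximum(priority[flags], probs[state][flags]) + 1
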
    def ensure_prioritized(infr, priority_edges):
        priority_edges = list(priority_edges)
        metric, priority = infr.ensure_priority_scores(priority_edges)
        infr.prioritize(metric=metric, edges=priority_edges, scores=priority)

    @profile
    def add_candidate_edges(infr, candidate_edges):
        candidate_edges = list(candidate_edges)
        new_edges = infr.ensure_edges_from(candidate_edges)

        if infr.test_mode:
            infr.apply_edge_truth(new_edges)

        if infr.params['redun.enabled']:
            priority_edges = list(infr.filter_edges_flagged_as_redun(candidate_edges))
            infr.print(
                'Got {} candidate edges, {} are new, '
                'and {} are non-redundant'.format(
                    len(candidate_edges), len(new_edges), len(priority_edges)
                )
            )
        else:
            infr.print(
                'Got {} candidate edges and {} are new'.format(
                    len(candidate_edges), len(new_edges)
                )
            )
            priority_edges = candidate_edges

        if len(priority_edges) > 0:
            infr.ensure_prioritized(priority_edges)
            if hasattr(infr, 'on_new_candidate_edges'):
                # hack callback for demo
                infr.on_new_candidate_edges(infr, new_edges)

        return len(priority_edges)

    @profile
    def refresh_candidate_edges(infr):
        """
        Search for candidate edges.
        Assign each edge a priority and add to queue.
        """
        infr.print('refresh_candidate_edges', 1)
        infr.assert_consistency_invariant()

        if infr.ibs is not None:
            candidate_edges = infr.find_lnbnn_candidate_edges()
        elif hasattr(infr, 'dummy_verif'):
            infr.print('Searching for dummy candidates')
            infr.print(
                'dummy vsone params ='
                + ut.repr4(infr.dummy_verif.dummy_params, nl=1, si=True)
            )
            ranks_top = infr.params['ranking.ntop']
            candidate_edges = infr.dummy_verif.find_candidate_edges(K=ranks_top)
        else:
            raise Exception('No method available to search for candidate edges')

        infr.add_candidate_edges(candidate_edges)
        infr.assert_consistency_invariant()

    @profile
    def _make_task_probs(infr, edges):
        """
        Predict edge probs for each pairwise classifier task
        """
        if infr.verifiers is None:
            raise ValueError('no classifiers exist')
        if not isinstance(infr.verifiers, dict):
            raise NotImplementedError('need to deploy or implement eval prediction')
        task_keys = list(infr.verifiers.keys())
        task_probs = {}
        # infr.print('[make_task_probs] predict {} for {} edges'.format(
        #     ut.conj_phrase(task_keys, 'and'), len(edges)))
        for task_key in task_keys:
            infr.print('predict {} for {} edges'.format(task_key, len(edges)))
            verif = infr.verifiers[task_key]
            probs_df = verif.predict_proba_df(edges)
            task_probs[task_key] = probs_df
        return task_probs

    @profile
    def _make_lnbnn_scores(infr, edges):
        edge_to_data = infr._get_cm_edge_data(edges)
        edges = list(edge_to_data.keys())
        edge_scores = list(ut.take_column(edge_to_data.values(), 'score'))
        edge_scores = ut.replace_nones(edge_scores, np.nan)
        edge_scores = np.array(edge_scores)
        # take the inf-norm
        normscores = edge_scores / vt.safe_max(edge_scores, nans=False)
        return normscores
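

# Illustrative sketch (not part of the original classes): _make_task_probs
# (above) returns one pandas DataFrame per task, with one row per edge and one
# column per class label. ensure_task_probs then converts each frame into a
# nested dict with DataFrame.to_dict(orient='index'). On hypothetical data:
def _example_task_probs_to_dict():
    probs_df = pd.DataFrame(
        {POSTV: [0.8, 0.1], NEGTV: [0.15, 0.8], INCMP: [0.05, 0.1]},
        index=[(1, 2), (3, 4)],
    )
    probs_dict = probs_df.to_dict(orient='index')
    # probs_dict[(1, 2)] == {POSTV: 0.8, NEGTV: 0.15, INCMP: 0.05}
    return probs_dict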