# Source code for wbia.algo.verif.deploy

# -*- coding: utf-8 -*-
import logging
from os.path import join, exists, basename
from wbia.algo.verif import sklearn_utils
from wbia.algo.verif import verifier
import utool as ut

print, rrr, profile = ut.inject2(__name__)
logger = logging.getLogger('wbia')


@ut.reloadable_class
class Deployer(object):
    """
    Transforms a OneVsOne problem into a deployable model.
    Registers and loads published models.

    The class-level attributes below describe how deployed classifier files
    are named and where published classifiers are stored.
    """

    # Ordered, dot-separated components of a deployed classifier file name.
    # The ``{...}`` entries are ``str.format`` fields.
    fname_parts = [
        'vsone',
        '{species}',
        '{task_key}',
        '{clf_key}',
        '{n_dims}',
        '{hashid}',
    ]

    # File name template (without the ``.cPkl`` extension)
    fname_fmtstr = '.'.join(fname_parts)

    # Appended to a classifier file name to get its metadata file name
    meta_suffix = '.meta.json'

    # Host and directory that published classifiers are rsync'd to; the host
    # is also used to build the https download URL.
    publish_info = {
        'remote': 'cthulhu.dyn.wildme.io',
        'path': '/data/public/models/pairclf',
    }

    # Registry of published classifiers: species -> task_key -> file name.
    # Commented-out entries record previously published models.
    published = {
        'zebra_grevys': {
            # 'photobomb_state': 'vsone.zebra_grevys.photobomb_state.RF.131.thwzdtnkjcwjqeve.cPkl',
            # 'match_state': 'vsone.zebra_grevys.match_state.RF.131.tranflbhimyzeeqi.cPkl',  # OLD PRE-TRAINED 0
            # 'match_state': 'vsone.zebra_grevys.match_state.RF.131.dlncrbzlpwjyqrdx.cPkl',  # OLD PRE-TRAINED 1
            # 'match_state': 'vsone.zebra_grevys.match_state.RF.131.kukigovqipdrjihg.ggr2.cPkl',  # GGR2 0
            # 'match_state': 'vsone.zebra_grevys.match_state.RF.131.qysrjnzuiziikxzp.kaia.cPkl',  # Kaia GZ CAs
            # 'match_state': 'vsone.zebra_grevys.match_state.RF.131.djvqkmyzrjgaudok.ggr2.cPkl',  # GGR2 1
            # 'match_state': 'vsone.zebra_grevys.match_state.RF.131.wwntfcphwligxjgy.cPkl',  # NAMED ANNOTS
            'match_state': 'vsone.zebra_grevys.match_state.RF.131.qwmzlhlnnsgzropq.cPkl',  # CA
        },
        'zebra_grevys+_canonical_': {
            'match_state': 'vsone.zebra_grevys+_canonical_.match_state.RF.107.cusnlyxbberandka.cPkl',  # CA Region
        },
        'zebra_mountain': {
            'match_state': 'vsone.zebra_mountain.match_state.RF.131.lciwhwikfycthvva.cPkl',
        },
        'zebra_plains': {
            'match_state': 'vsone.zebra_plains.match_state.RF.131.eurizlstehqjvlsu.cPkl',  # OLD PRE-TRAINED
        },
        'giraffe_reticulated': {
            # 'match_state': 'vsone.giraffe_reticulated.match_state.RF.107.clvhhvwgwxpflnhu.ggr2.cPkl',  # GGR2 0
            'match_state': 'vsone.giraffe_reticulated.match_state.RF.131.kqbaqnrdyxpjrzjd.ggr2.cPkl',  # GGR2 1
        },
    }

    def __init__(self, dpath='.', pblm=None):
        self.dpath = dpath
        self.pblm = pblm

    def _load_published(self, ibs, species, task_key):
        """
        Downloads one published classifier and wraps it in a verifier.

        Args:
            ibs: forwarded unchanged to ``_make_verifier``
            species (str): key into ``self.published``
            task_key (str): key into the file-name table of that species

        Returns:
            the verifier built by ``_make_verifier`` from the downloaded file

        Example:
            >>> from wbia.algo.verif.vsone import *  # NOQA
            >>> self = Deployer()
            >>> species = 'zebra_plains'
            >>> task_key = 'match_state'
        """
        base_url = 'https://{remote}/public/models/pairclf'.format(**self.publish_info)
        fname = self.published[species][task_key]

        deploy_url = base_url + '/' + fname
        meta_url = deploy_url + self.meta_suffix

        grabkw = {'appname': 'wbia', 'check_hash': False, 'verbose': 0}

        # The metadata file is fetched next to the classifier; its local path
        # is not needed here.
        ut.grab_file_url(meta_url, **grabkw)
        deploy_fpath = ut.grab_file_url(deploy_url, **grabkw)

        return self._make_verifier(ibs, deploy_fpath, task_key)

    def _make_ensemble_verifier(self, task_key, clf_key, data_key):
        """
        Builds a verifier backed by a soft-voting ensemble of the problem's
        evaluation classifiers for the given task / classifier / data keys.

        Nothing is written to disk; the verifier is constructed directly from
        an in-memory deploy-info dictionary.

        Args:
            task_key (str): key into ``pblm.eval_task_clfs`` and the sample
                subtasks
            clf_key (str): classifier key under ``task_key``
            data_key (str): feature-data key under ``clf_key``

        Returns:
            the constructed ``verifier.Verifier``
        """
        problem = self.pblm
        ibs = problem.infr.ibs
        data_info = problem.feat_extract_info[data_key]
        # Hack together an ensemble verifier
        member_clfs = problem.eval_task_clfs[task_key][clf_key][data_key]
        labels = problem.samples.subtasks[task_key]
        ensemble = sklearn_utils.voting_ensemble(member_clfs, voting='soft')
        metadata = {
            'task_key': task_key,
            'clf_key': 'ensemble({})'.format(data_key),
            'data_key': data_key,
            'class_names': labels.class_names,
            'data_info': data_info,
        }
        return verifier.Verifier(ibs, {'clf': ensemble, 'metadata': metadata})

    def _make_verifier(self, ibs, deploy_fpath, task_key):
        """
        Loads saved deploy info from disk and wraps it in a verifier.

        Args:
            ibs: forwarded unchanged to ``verifier.Verifier``
            deploy_fpath (str): path of the saved deploy-info file
            task_key (str | None): when not None, the loaded metadata must
                record this same task key

        Returns:
            the constructed ``verifier.Verifier``

        Ignore:
            # py3 side
            clf = deploy_info['clf']
            a = clf.estimators_[0]
            b = a.tree_
            ut.save_data('_tree.pkl', b)
            c = b.__getstate__()
            d = c['nodes']
            ut.save_data('_nodes.pkl', d)

            a.estimators_[0].tree_.__getstate__()['nodes']


        Ignore:
            # py2 side
            ut.load_data('_tree.pkl')
            ut.load_data('_nodes.pkl')

            >>> from wbia.algo.verif.vsone import *  # NOQA
            >>> params = dict(sample_method='random')
            >>> pblm = OneVsOneProblem.from_empty('PZ_MTEST', **params)
            >>> pblm.setup(with_simple=False)
            >>> task_key = pblm.primary_task_key
            >>> self = Deployer(dpath='.', pblm=pblm)
            >>> deploy_info = self.deploy()

            a = deploy_info['clf']
            d = a.estimators_[0].tree_.__getstate__()['nodes']


        Ignore:
            I'm having a similar issue when trying to use python2 to load a
            sklearn RandomForestClassifier that I saved in python3. I created a
            MWE.

            In python 3

                import numpy as np
                import pickle
                data = np.array(
                    [( 1, 26, 69,   5.32214928e+00,  0.69562945, 563,  908.,  1),
                     ( 2,  7, 62,   1.74883020e+00,  0.33854101, 483,  780.,  1),
                     (-1, -1, -2,  -2.00000000e+00,  0.76420451,   7,    9., -2),
                     (-1, -1, -2,  -2.00000000e+00,  0.        ,  62,  106., -2)],
                  dtype=[('left_child', '<i8'), ('right_child', '<i8'),
                  ('feature', '<i8'), ('threshold', '<f8'), ('impurity',
                  '<f8'), ('n_node_samples', '<i8'),
                  ('weighted_n_node_samples', '<f8'), ('missing_direction',
                  '<i8')])

                # Save using pickle
                with open('data.pkl', 'wb') as file_:
                    # Use protocol 2 to support python2 and 3
                    pickle.dump(data, file_, protocol=2)

                # Save with numpy directly
                np.save('data.npy', data)

            Then in python 2
                # Load with pickle
                import pickle
                with open('data.pkl', 'rb') as file_:
                    data = pickle.load(file_)
                # This results in `ValueError: non-string names in Numpy dtype unpickling`

                # Load with numpy directly
                data = np.load('data.npy')
                # This works

            However this still doesn't make sklearn play nice between 2 and 3.
            So, how can we get pickle to load this numpy object correctly?
            Here is the fix suggested in the link:

                from lib2to3.fixes.fix_imports import MAPPING
                import sys
                import pickle

                # MAPPING maps Python 2 names to Python 3 names. We want this in reverse.
                REVERSE_MAPPING = {}
                for key, val in MAPPING.items():
                    REVERSE_MAPPING[val] = key

                # We can override the Unpickler and loads
                class Python_3_Unpickler(pickle.Unpickler):
                    def find_class(self, module, name):
                        if module in REVERSE_MAPPING:
                            module = REVERSE_MAPPING[module]
                        __import__(module)
                        mod = sys.modules[module]
                        klass = getattr(mod, name)
                        return klass

                with open('data.pkl', 'rb') as file_:
                    data = Python_3_Unpickler(file_).load()

            This still doesn't work



            https://stackoverflow.com/questions/41720952/unpickle-sklearn-tree-descisiontreeregressor-in-python-2-from-python3

        """
        # NOTE(review): the ``.cPkl`` files are presumably pickles and may have
        # been downloaded (see ``_load_published``) -- confirm that the source
        # is trusted before loading.
        loaded_info = ut.load_data(deploy_fpath)
        verif = verifier.Verifier(ibs, deploy_info=loaded_info)
        if task_key is None:
            # Caller did not ask for a task-key consistency check
            return verif
        assert (
            verif.metadata['task_key'] == task_key
        ), 'bad saved clf at fpath={}'.format(deploy_fpath)
        return verif

[docs]    def load_published(self, ibs, species):
        task_fnames = self.published[species]
        logger.info('loading published: %r' % (task_fnames,))
        classifiers = {
            task_key: self._load_published(ibs, species, task_key)
            for task_key in task_fnames.keys()
        }
        logger.info('loaded classifiers: %r' % (classifiers,))
        return classifiers

[docs]    def find_pretrained(self):
        import glob
        import parse

        fname_fmt = self.fname_fmtstr + '.cPkl'
        task_clf_candidates = ut.ddict(list)
        globstr = self.fname_parts[0] + '.*.cPkl'
        for fpath in glob.iglob(join(self.dpath, globstr)):
            fname = basename(fpath)
            result = parse.parse(fname_fmt, fname)
            if result:
                task_key = result.named['task_key']
                task_clf_candidates[task_key].append(fpath)
        return task_clf_candidates

[docs]    def find_latest_remote(self):
        """
        Used to update the published dict

        CommandLine:
            python -m wbia.algo.verif.vsone find_latest_remote

        Example:
            >>> # DISABLE_DOCTEST
            >>> from wbia.algo.verif.vsone import *  # NOQA
            >>> self = Deployer()
            >>> task_clf_names = self.find_latest_remote()
        """
        base_url = 'https://{remote}/public/models/pairclf'.format(**self.publish_info)
        import requests
        import bs4

        resp = requests.get(base_url)
        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        table = soup.findAll('table')[0]

        def parse_bs_table(table):
            n_columns = 0
            n_rows = 0
            column_names = []
            # Find number of rows and columns
            # we also find the column titles if we can
            for row in table.find_all('tr'):
                td_tags = row.find_all('td')
                if len(td_tags) > 0:
                    n_rows += 1
                    if n_columns == 0:
                        n_columns = len(td_tags)
                # Handle column names if we find them
                th_tags = row.find_all('th')
                if len(th_tags) > 0 and len(column_names) == 0:
                    for th in th_tags:
                        column_names.append(th.get_text())

            # Safeguard on Column Titles
            if len(column_names) > 0 and len(column_names) != n_columns:
                raise Exception('Column titles do not match the number of columns')
            columns = column_names if len(column_names) > 0 else range(0, n_columns)
            import pandas as pd

            df = pd.DataFrame(columns=columns, index=list(range(0, n_rows)))
            row_marker = 0
            for row in table.find_all('tr'):
                column_marker = 0
                columns = row.find_all('td')
                for column in columns:
                    df.iat[row_marker, column_marker] = column.get_text().strip()
                    column_marker += 1
                if len(columns) > 0:
                    row_marker += 1
            return df

        df = parse_bs_table(table)
        # Find all available models
        df = df[df['Name'].map(lambda x: x.endswith('.cPkl'))]
        # df = df[df['Last modified'].map(len) > 0]

        fname_fmt = self.fname_fmtstr + '.cPkl'
        task_clf_candidates = ut.ddict(list)
        import parse

        for idx, row in df.iterrows():
            fname = basename(row['Name'])
            result = parse.parse(fname_fmt, fname)
            if result:
                task_key = result.named['task_key']
                species = result.named['species']
                task_clf_candidates[(species, task_key)].append(idx)

        task_clf_fnames = ut.ddict(dict)
        for key, idxs in task_clf_candidates.items():
            species, task_key = key
            # Find the classifier most recently created
            max_idx = ut.argmax(df.loc[idxs]['Last modified'].tolist())
            fname = df.loc[idxs[max_idx]]['Name']
            task_clf_fnames[species][task_key] = fname

        logger.info('published = ' + ut.repr2(task_clf_fnames, nl=2))
        return task_clf_fnames

[docs]    def find_latest_local(self):
        """
        >>> self = Deployer()
        >>> self.find_pretrained()
        >>> self.find_latest_local()
        """
        from os.path import getctime

        task_clf_candidates = self.find_pretrained()
        task_clf_fpaths = {}
        for task_key, fpaths in task_clf_candidates.items():
            # Find the classifier most recently created
            fpath = fpaths[ut.argmax(map(getctime, fpaths))]
            task_clf_fpaths[task_key] = fpath
        return task_clf_fpaths

    def _make_deploy_metadata(self, task_key=None):
        """
        Collects the metadata that identifies a deployable classifier and
        derives its file name from it.

        Calls ``pblm.setup()`` first if the problem has no samples yet.

        Args:
            task_key (str | None): task to describe; defaults to
                ``pblm.primary_task_key``

        Returns:
            tuple: ``(deploy_metadata, deploy_fname)`` where
            ``deploy_metadata`` is the metadata dict extended with the
            ``'hashid'`` and ``'fname'`` entries and ``deploy_fname`` is the
            ``.cPkl`` file name.
        """
        pblm = self.pblm
        if pblm.samples is None:
            pblm.setup()

        if task_key is None:
            task_key = pblm.primary_task_key

        # task_keys = list(pblm.samples.supported_tasks())
        clf_key = pblm.default_clf_key
        data_key = pblm.default_data_key

        # Describe the features the classifier is trained on
        data_info = pblm.feat_extract_info[data_key]
        _feat_extract_config, feat_dims = data_info

        samples = pblm.samples
        labels = samples.subtasks[task_key]

        edge_hashid = samples.edge_set_hashid()
        label_hashid = samples.task_label_hashid(task_key)
        tasksamp_hashid = samples.task_sample_hashid(task_key)

        unique_annots = samples._unique_annots
        annot_hashid = ut.hashid_arr(unique_annots.visual_uuids, 'annots')

        # species = pblm.infr.ibs.get_primary_database_species(
        #     samples._unique_annots.aid)
        species = '+'.join(sorted(set(unique_annots.species)))

        # The whole dict is fed to the hash below, so its content determines
        # the hashid and therefore the file name.
        metadata = {
            'tasksamp_hashid': tasksamp_hashid,
            'edge_hashid': edge_hashid,
            'label_hashid': label_hashid,
            'annot_hashid': annot_hashid,
            'class_hist': labels.make_histogram(),
            'class_names': labels.class_names,
            'data_info': data_info,
            'task_key': task_key,
            'species': species,
            'data_key': data_key,
            'clf_key': clf_key,
            'n_dims': len(feat_dims),
            # 'aid_pairs': samples.aid_pairs,
        }

        meta_cfgstr = ut.repr2(metadata, kvsep=':', itemsep='', si=True)
        hashid = ut.hash_data(meta_cfgstr)[:16]

        deploy_fname = self.fname_fmtstr.format(hashid=hashid, **metadata) + '.cPkl'

        deploy_metadata = dict(metadata, hashid=hashid, fname=deploy_fname)
        return deploy_metadata, deploy_fname

    def _make_deploy_info(self, task_key=None):
        pblm = self.pblm
        if pblm.samples is None:
            pblm.setup()

        if task_key is None:
            task_key = pblm.primary_task_key

        deploy_metadata, deploy_fname = self._make_deploy_metadata(task_key)
        clf_key = deploy_metadata['clf_key']
        data_key = deploy_metadata['data_key']

        clf = None
        if pblm.deploy_task_clfs:
            clf = pblm.deploy_task_clfs[task_key][clf_key][data_key]
        if not clf:
            pblm.learn_deploy_classifiers([task_key], clf_key, data_key)
            clf = pblm.deploy_task_clfs[task_key][clf_key][data_key]

        deploy_info = {
            'clf': clf,
            'metadata': deploy_metadata,
        }
        return deploy_info

[docs]    def ensure(self, task_key):
        _, fname = self._make_deploy_metadata(task_key=task_key)
        fpath = join(self.dpath, fname)
        if exists(fpath):
            deploy_info = ut.load_data(fpath)
            assert bool(deploy_info['clf']), 'must have clf'
        else:
            deploy_info = self.deploy(task_key=task_key)
            assert exists(fpath), 'must now exist'
        verif = verifier.Verifier(self.pblm.infr.ibs, deploy_info=deploy_info)
        assert verif.metadata['task_key'] == task_key, 'bad saved clf at fpath={}'.format(
            fpath
        )
        return verif

[docs]    def deploy(self, task_key=None, publish=False):
        """
        Trains and saves a classifier for deployment

        Notes:
            A deployment consists of the following information
                * The classifier itself
                * Information needed to construct the input to the classifier
                    - TODO: can this be encoded as an sklearn pipeline?
                * Metadata concerning what data the classifier was trained with
                * PUBLISH TO /media/hdd/PUBLIC/models/pairclf

        Example:
            >>> # xdoctest: +REQUIRES(module:wbia_cnn, --slow)
            >>> from wbia.algo.verif.vsone import *  # NOQA
            >>> params = dict(sample_method='random')
            >>> pblm = OneVsOneProblem.from_empty('PZ_MTEST', **params)
            >>> pblm.setup(with_simple=False)
            >>> task_key = pblm.primary_task_key
            >>> self = Deployer(dpath='.', pblm=pblm)
            >>> deploy_info = self.deploy()

        Ignore:
            pblm.evaluate_classifiers(with_simple=False)
            res = pblm.task_combo_res[pblm.primary_task_key]['RF']['learn(sum,glob)']
        """
        deploy_info = self._make_deploy_info(task_key=task_key)
        deploy_fname = deploy_info['metadata']['fname']

        meta_fname = deploy_fname + self.meta_suffix
        deploy_fpath = join(self.dpath, deploy_fname)
        meta_fpath = join(self.dpath, meta_fname)

        ut.save_json(meta_fpath, deploy_info['metadata'])
        ut.save_data(deploy_fpath, deploy_info)

        if publish:
            user = ut.get_user_name()
            remote_uri = '{user}@{remote}:{path}'.format(user=user, **self.publish_info)

            ut.rsync(meta_fpath, remote_uri + '/' + meta_fname)
            ut.rsync(deploy_fpath, remote_uri + '/' + deploy_fname)
        return deploy_info