# -*- coding: utf-8 -*-
import logging
from os.path import join, exists, basename
from wbia.algo.verif import sklearn_utils
from wbia.algo.verif import verifier
import utool as ut
print, rrr, profile = ut.inject2(__name__)
logger = logging.getLogger('wbia')
[docs]@ut.reloadable_class
class Deployer(object):
"""
Transforms a OneVsOne problem into a deployable model.
Registers and loads published models.
"""
fname_parts = [
'vsone',
'{species}',
'{task_key}',
'{clf_key}',
'{n_dims}',
'{hashid}',
]
fname_fmtstr = '.'.join(fname_parts)
meta_suffix = '.meta.json'
publish_info = {
'remote': 'cthulhu.dyn.wildme.io',
'path': '/data/public/models/pairclf',
}
published = {
'zebra_grevys': {
# 'photobomb_state': 'vsone.zebra_grevys.photobomb_state.RF.131.thwzdtnkjcwjqeve.cPkl',
# 'match_state': 'vsone.zebra_grevys.match_state.RF.131.tranflbhimyzeeqi.cPkl', # OLD PRE-TRAINED 0
# 'match_state': 'vsone.zebra_grevys.match_state.RF.131.dlncrbzlpwjyqrdx.cPkl', # OLD PRE-TRAINED 1
# 'match_state': 'vsone.zebra_grevys.match_state.RF.131.kukigovqipdrjihg.ggr2.cPkl', # GGR2 0
# 'match_state': 'vsone.zebra_grevys.match_state.RF.131.qysrjnzuiziikxzp.kaia.cPkl', # Kaia GZ CAs
# 'match_state': 'vsone.zebra_grevys.match_state.RF.131.djvqkmyzrjgaudok.ggr2.cPkl', # GGR2 1
# 'match_state': 'vsone.zebra_grevys.match_state.RF.131.wwntfcphwligxjgy.cPkl', # NAMED ANNOTS
'match_state': 'vsone.zebra_grevys.match_state.RF.131.qwmzlhlnnsgzropq.cPkl', # CA
},
'zebra_grevys+_canonical_': {
'match_state': 'vsone.zebra_grevys+_canonical_.match_state.RF.107.cusnlyxbberandka.cPkl', # CA Region
},
'zebra_mountain': {
'match_state': 'vsone.zebra_mountain.match_state.RF.131.lciwhwikfycthvva.cPkl',
},
'zebra_plains': {
'match_state': 'vsone.zebra_plains.match_state.RF.131.eurizlstehqjvlsu.cPkl', # OLD PRE-TRAINED
},
'giraffe_reticulated': {
# 'match_state': 'vsone.giraffe_reticulated.match_state.RF.107.clvhhvwgwxpflnhu.ggr2.cPkl', # GGR2 0
'match_state': 'vsone.giraffe_reticulated.match_state.RF.131.kqbaqnrdyxpjrzjd.ggr2.cPkl', # GGR2 1
},
}
def __init__(self, dpath='.', pblm=None):
self.dpath = dpath
self.pblm = pblm
def _load_published(self, ibs, species, task_key):
"""
>>> from wbia.algo.verif.vsone import * # NOQA
>>> self = Deployer()
>>> species = 'zebra_plains'
>>> task_key = 'match_state'
"""
base_url = 'https://{remote}/public/models/pairclf'.format(**self.publish_info)
task_fnames = self.published[species]
fname = task_fnames[task_key]
grabkw = dict(appname='wbia', check_hash=False, verbose=0)
meta_url = base_url + '/' + fname + self.meta_suffix
meta_fpath = ut.grab_file_url(meta_url, **grabkw) # NOQA
deploy_url = base_url + '/' + fname
deploy_fpath = ut.grab_file_url(deploy_url, **grabkw)
verif = self._make_verifier(ibs, deploy_fpath, task_key)
return verif
def _make_ensemble_verifier(self, task_key, clf_key, data_key):
pblm = self.pblm
ibs = pblm.infr.ibs
data_info = pblm.feat_extract_info[data_key]
# Hack together an ensemble verifier
clf_list = pblm.eval_task_clfs[task_key][clf_key][data_key]
labels = pblm.samples.subtasks[task_key]
eclf = sklearn_utils.voting_ensemble(clf_list, voting='soft')
deploy_info = {
'clf': eclf,
'metadata': {
'task_key': task_key,
'clf_key': 'ensemble({})'.format(data_key),
'data_key': data_key,
'class_names': labels.class_names,
'data_info': data_info,
},
}
verif = verifier.Verifier(ibs, deploy_info)
return verif
def _make_verifier(self, ibs, deploy_fpath, task_key):
"""
Ignore:
# py3 side
clf = deploy_info['clf']
a = clf.estimators_[0]
b = a.tree_
ut.save_data('_tree.pkl', b)
c = b.__getstate__()
d = c['nodes']
ut.save_data('_nodes.pkl', d)
a.estimators_[0].tree_.__getstate__()['nodes']
Ignore:
# py2 side
ut.load_data('_tree.pkl')
ut.load_data('_nodes.pkl')
>>> from wbia.algo.verif.vsone import * # NOQA
>>> params = dict(sample_method='random')
>>> pblm = OneVsOneProblem.from_empty('PZ_MTEST', **params)
>>> pblm.setup(with_simple=False)
>>> task_key = pblm.primary_task_key
>>> self = Deployer(dpath='.', pblm=pblm)
>>> deploy_info = self.deploy()
a = deploy_info['clf']
d = a.estimators_[0].tree_.__getstate__()['nodes']
Ignore:
I'm having a similar issue when trying to use python2 to load a
sklearn RandomForestClassifier that I saved in python3. I created a
MWE.
In python 3
import numpy as np
import pickle
data = np.array(
[( 1, 26, 69, 5.32214928e+00, 0.69562945, 563, 908., 1),
( 2, 7, 62, 1.74883020e+00, 0.33854101, 483, 780., 1),
(-1, -1, -2, -2.00000000e+00, 0.76420451, 7, 9., -2),
(-1, -1, -2, -2.00000000e+00, 0. , 62, 106., -2)],
dtype=[('left_child', '<i8'), ('right_child', '<i8'),
('feature', '<i8'), ('threshold', '<f8'), ('impurity',
'<f8'), ('n_node_samples', '<i8'),
('weighted_n_node_samples', '<f8'), ('missing_direction',
'<i8')])
# Save using pickle
with open('data.pkl', 'wb') as file_:
# Use protocol 2 to support python2 and 3
pickle.dump(data, file_, protocol=2)
# Save with numpy directly
np.save('data.npy', data)
Then in python 2
# Load with pickle
import pickle
with open('data.pkl', 'rb') as file_:
data = pickle.load(file_)
# This results in `ValueError: non-string names in Numpy dtype unpickling`
# Load with numpy directly
data = np.load('data.npy')
# This works
However this still doesn't make sklearn play nice between 2 and 3.
So, how can we get pickle to load this numpy object correctly?
Here is the fix suggested in the link:
from lib2to3.fixes.fix_imports import MAPPING
import sys
import pickle
# MAPPING maps Python 2 names to Python 3 names. We want this in reverse.
REVERSE_MAPPING = {}
for key, val in MAPPING.items():
REVERSE_MAPPING[val] = key
# We can override the Unpickler and loads
class Python_3_Unpickler(pickle.Unpickler):
def find_class(self, module, name):
if module in REVERSE_MAPPING:
module = REVERSE_MAPPING[module]
__import__(module)
mod = sys.modules[module]
klass = getattr(mod, name)
return klass
with open('data.pkl', 'rb') as file_:
data = Python_3_Unpickler(file_).load()
This still doesn't work
https://stackoverflow.com/questions/41720952/unpickle-sklearn-tree-descisiontreeregressor-in-python-2-from-python3
"""
deploy_info = ut.load_data(deploy_fpath)
verif = verifier.Verifier(ibs, deploy_info=deploy_info)
if task_key is not None:
assert (
verif.metadata['task_key'] == task_key
), 'bad saved clf at fpath={}'.format(deploy_fpath)
return verif
[docs] def load_published(self, ibs, species):
task_fnames = self.published[species]
logger.info('loading published: %r' % (task_fnames,))
classifiers = {
task_key: self._load_published(ibs, species, task_key)
for task_key in task_fnames.keys()
}
logger.info('loaded classifiers: %r' % (classifiers,))
return classifiers
[docs] def find_pretrained(self):
import glob
import parse
fname_fmt = self.fname_fmtstr + '.cPkl'
task_clf_candidates = ut.ddict(list)
globstr = self.fname_parts[0] + '.*.cPkl'
for fpath in glob.iglob(join(self.dpath, globstr)):
fname = basename(fpath)
result = parse.parse(fname_fmt, fname)
if result:
task_key = result.named['task_key']
task_clf_candidates[task_key].append(fpath)
return task_clf_candidates
[docs] def find_latest_remote(self):
"""
Used to update the published dict
CommandLine:
python -m wbia.algo.verif.vsone find_latest_remote
Example:
>>> # DISABLE_DOCTEST
>>> from wbia.algo.verif.vsone import * # NOQA
>>> self = Deployer()
>>> task_clf_names = self.find_latest_remote()
"""
base_url = 'https://{remote}/public/models/pairclf'.format(**self.publish_info)
import requests
import bs4
resp = requests.get(base_url)
soup = bs4.BeautifulSoup(resp.text, 'html.parser')
table = soup.findAll('table')[0]
def parse_bs_table(table):
n_columns = 0
n_rows = 0
column_names = []
# Find number of rows and columns
# we also find the column titles if we can
for row in table.find_all('tr'):
td_tags = row.find_all('td')
if len(td_tags) > 0:
n_rows += 1
if n_columns == 0:
n_columns = len(td_tags)
# Handle column names if we find them
th_tags = row.find_all('th')
if len(th_tags) > 0 and len(column_names) == 0:
for th in th_tags:
column_names.append(th.get_text())
# Safeguard on Column Titles
if len(column_names) > 0 and len(column_names) != n_columns:
raise Exception('Column titles do not match the number of columns')
columns = column_names if len(column_names) > 0 else range(0, n_columns)
import pandas as pd
df = pd.DataFrame(columns=columns, index=list(range(0, n_rows)))
row_marker = 0
for row in table.find_all('tr'):
column_marker = 0
columns = row.find_all('td')
for column in columns:
df.iat[row_marker, column_marker] = column.get_text().strip()
column_marker += 1
if len(columns) > 0:
row_marker += 1
return df
df = parse_bs_table(table)
# Find all available models
df = df[df['Name'].map(lambda x: x.endswith('.cPkl'))]
# df = df[df['Last modified'].map(len) > 0]
fname_fmt = self.fname_fmtstr + '.cPkl'
task_clf_candidates = ut.ddict(list)
import parse
for idx, row in df.iterrows():
fname = basename(row['Name'])
result = parse.parse(fname_fmt, fname)
if result:
task_key = result.named['task_key']
species = result.named['species']
task_clf_candidates[(species, task_key)].append(idx)
task_clf_fnames = ut.ddict(dict)
for key, idxs in task_clf_candidates.items():
species, task_key = key
# Find the classifier most recently created
max_idx = ut.argmax(df.loc[idxs]['Last modified'].tolist())
fname = df.loc[idxs[max_idx]]['Name']
task_clf_fnames[species][task_key] = fname
logger.info('published = ' + ut.repr2(task_clf_fnames, nl=2))
return task_clf_fnames
[docs] def find_latest_local(self):
"""
>>> self = Deployer()
>>> self.find_pretrained()
>>> self.find_latest_local()
"""
from os.path import getctime
task_clf_candidates = self.find_pretrained()
task_clf_fpaths = {}
for task_key, fpaths in task_clf_candidates.items():
# Find the classifier most recently created
fpath = fpaths[ut.argmax(map(getctime, fpaths))]
task_clf_fpaths[task_key] = fpath
return task_clf_fpaths
def _make_deploy_metadata(self, task_key=None):
pblm = self.pblm
if pblm.samples is None:
pblm.setup()
if task_key is None:
task_key = pblm.primary_task_key
# task_keys = list(pblm.samples.supported_tasks())
clf_key = pblm.default_clf_key
data_key = pblm.default_data_key
# Save the classifie
data_info = pblm.feat_extract_info[data_key]
feat_extract_config, feat_dims = data_info
samples = pblm.samples
labels = samples.subtasks[task_key]
edge_hashid = samples.edge_set_hashid()
label_hashid = samples.task_label_hashid(task_key)
tasksamp_hashid = samples.task_sample_hashid(task_key)
annot_hashid = ut.hashid_arr(samples._unique_annots.visual_uuids, 'annots')
# species = pblm.infr.ibs.get_primary_database_species(
# samples._unique_annots.aid)
species = '+'.join(sorted(set(samples._unique_annots.species)))
metadata = {
'tasksamp_hashid': tasksamp_hashid,
'edge_hashid': edge_hashid,
'label_hashid': label_hashid,
'annot_hashid': annot_hashid,
'class_hist': labels.make_histogram(),
'class_names': labels.class_names,
'data_info': data_info,
'task_key': task_key,
'species': species,
'data_key': data_key,
'clf_key': clf_key,
'n_dims': len(feat_dims),
# 'aid_pairs': samples.aid_pairs,
}
meta_cfgstr = ut.repr2(metadata, kvsep=':', itemsep='', si=True)
hashid = ut.hash_data(meta_cfgstr)[0:16]
deploy_fname = self.fname_fmtstr.format(hashid=hashid, **metadata) + '.cPkl'
deploy_metadata = metadata.copy()
deploy_metadata['hashid'] = hashid
deploy_metadata['fname'] = deploy_fname
return deploy_metadata, deploy_fname
def _make_deploy_info(self, task_key=None):
pblm = self.pblm
if pblm.samples is None:
pblm.setup()
if task_key is None:
task_key = pblm.primary_task_key
deploy_metadata, deploy_fname = self._make_deploy_metadata(task_key)
clf_key = deploy_metadata['clf_key']
data_key = deploy_metadata['data_key']
clf = None
if pblm.deploy_task_clfs:
clf = pblm.deploy_task_clfs[task_key][clf_key][data_key]
if not clf:
pblm.learn_deploy_classifiers([task_key], clf_key, data_key)
clf = pblm.deploy_task_clfs[task_key][clf_key][data_key]
deploy_info = {
'clf': clf,
'metadata': deploy_metadata,
}
return deploy_info
[docs] def ensure(self, task_key):
_, fname = self._make_deploy_metadata(task_key=task_key)
fpath = join(self.dpath, fname)
if exists(fpath):
deploy_info = ut.load_data(fpath)
assert bool(deploy_info['clf']), 'must have clf'
else:
deploy_info = self.deploy(task_key=task_key)
assert exists(fpath), 'must now exist'
verif = verifier.Verifier(self.pblm.infr.ibs, deploy_info=deploy_info)
assert verif.metadata['task_key'] == task_key, 'bad saved clf at fpath={}'.format(
fpath
)
return verif
[docs] def deploy(self, task_key=None, publish=False):
"""
Trains and saves a classifier for deployment
Notes:
A deployment consists of the following information
* The classifier itself
* Information needed to construct the input to the classifier
- TODO: can this be encoded as an sklearn pipeline?
* Metadata concerning what data the classifier was trained with
* PUBLISH TO /media/hdd/PUBLIC/models/pairclf
Example:
>>> # xdoctest: +REQUIRES(module:wbia_cnn, --slow)
>>> from wbia.algo.verif.vsone import * # NOQA
>>> params = dict(sample_method='random')
>>> pblm = OneVsOneProblem.from_empty('PZ_MTEST', **params)
>>> pblm.setup(with_simple=False)
>>> task_key = pblm.primary_task_key
>>> self = Deployer(dpath='.', pblm=pblm)
>>> deploy_info = self.deploy()
Ignore:
pblm.evaluate_classifiers(with_simple=False)
res = pblm.task_combo_res[pblm.primary_task_key]['RF']['learn(sum,glob)']
"""
deploy_info = self._make_deploy_info(task_key=task_key)
deploy_fname = deploy_info['metadata']['fname']
meta_fname = deploy_fname + self.meta_suffix
deploy_fpath = join(self.dpath, deploy_fname)
meta_fpath = join(self.dpath, meta_fname)
ut.save_json(meta_fpath, deploy_info['metadata'])
ut.save_data(deploy_fpath, deploy_info)
if publish:
user = ut.get_user_name()
remote_uri = '{user}@{remote}:{path}'.format(user=user, **self.publish_info)
ut.rsync(meta_fpath, remote_uri + '/' + meta_fname)
ut.rsync(deploy_fpath, remote_uri + '/' + deploy_fname)
return deploy_info