#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
from os.path import splitext, join, exists, commonprefix
import utool as ut
import re
(print, rrr, profile) = ut.inject2(__name__, '[getshark]')
logger = logging.getLogger('wbia')
def sync_wildbook():
"""
MAIN ENTRY POINT
Syncronizes our wbia database with a wildbook database like whaleshark.org
#cd ~/work/WS_ALL
python -m wbia.scripts.getshark
cd /media/raid/raw/WhaleSharks_WB/
>>> from wbia.scripts.getshark import * # NOQA
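>>> # Requires network and database access; sketch of the invocation:
>>> # sync_wildbook()  # doctest: +SKIP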
"""
from wbia.scripts import getshark
# Choose which wildbook instance to sync; the first branch is the default
if True:
# Read ALL data from whaleshark.org
parsed = getshark.parse_whaleshark_org()
db = 'WS_ALL'
species = 'whale_shark'
# images_url = 'http://www.whaleshark.org/listImages.jsp'
# keyword_url = 'http://www.whaleshark.org/getKeywordImages.jsp'
download_dir = join('/media/raid/raw/Wildbook/', 'sharkimages')
if False:
# Read ALL data from mantamatcher.org
images_url = 'http://www.mantamatcher.org/listImages.jsp'
keyword_url = None
db = 'Mantas'
species = 'manta_ray'
parsed = parse_wildbook(images_url, keyword_url)
download_dir = join('/media/raid/raw/Wildbook/', 'mantas')
if False:
# Read ALL data from mantamatcher.org (matcher image list)
images_url = 'http://www.mantamatcher.org/listMatcherImages.jsp'
keyword_url = None
db = 'MantaMatcher'
species = 'manta_ray'
parsed = parse_wildbook(images_url, keyword_url)
download_dir = join('/media/raid/raw/Wildbook/', 'MantaMatcher')
# Set DRY = False to actually apply changes
DRY = True
# Prepare the output directory for writing, if it doesn't exist
ut.ensuredir(download_dir)
parsed = getshark.postprocess_filenames(parsed, download_dir)
parsed = getshark.postprocess_extfilter(parsed)
parsed = getshark.postprocess_tags_build(parsed)
parsed = getshark.postprocess_tags_filter(parsed)
# Download images that we don't have yet
getshark.download_missing_images(parsed)
if False:
parsed._meta['ignore'].extend(['fname_tags', 'tags', 'orig_fname'])
# Rename the variable to parsed_dl now that the images are downloaded
parsed_dl = parsed.copy()
parsed_dl = getshark.postprocess_corrupted(parsed_dl)
parsed_dl = getshark.postprocess_uuids(parsed_dl)
# Rename again after the time-intensive uuid step
unmerged = parsed_dl.copy()
# Squash duplicate images
info = getshark.postprocess_rectify_duplicates(unmerged)
# Check these images against what currently exists in WS_ALL
import wbia
ibs = wbia.opendb(db, allow_newdir=True)
all_images = ibs.images()
num_ia_unique = len(set(all_images.uuids) - set(info['uuid']))
# TODO: Check that all the UUIDs in the IA database are indeed ok
# Determine which items are in the database
info_gid_list = ibs.get_image_gids_from_uuid(info['uuid'])
is_hit = ut.flag_not_None_items(info_gid_list)
is_miss = ut.flag_None_items(info_gid_list)
hit_info = info.compress(is_hit) # NOQA
miss_info = info.compress(is_miss) # NOQA
logger.info('The IA database has %r images' % (len(all_images),))
logger.info(
'The IA database has %r/%r images not from this downloaded set'
% (num_ia_unique, len(all_images))
)
logger.info('Have %d/%d parsed images' % (len(hit_info), len(info_gid_list)))
logger.info('Missing %d/%d parsed images' % (len(miss_info), len(info_gid_list)))
# Add new info
if not DRY:
if len(miss_info):
add_new_images(ibs, miss_info, species)
# REFIND Existing
logger.info('Redoing existence check')
info_gid_list = ibs.get_image_gids_from_uuid(info['uuid'])
is_hit = ut.flag_not_None_items(info_gid_list)
is_miss = ut.flag_None_items(info_gid_list)
hit_info = info.compress(is_hit) # NOQA
miss_info = info.compress(is_miss) # NOQA
logger.info('The IA database has %r images' % (len(all_images),))
logger.info(
'The IA database has %r/%r images not from this downloaded set'
% (num_ia_unique, len(all_images))
)
logger.info('Have %d/%d parsed images' % (len(hit_info), len(info_gid_list)))
logger.info('Missing %d/%d parsed images' % (len(miss_info), len(info_gid_list)))
# Sync existing info
if True:
sync_existing_images(ibs, hit_info, species, DRY)
def sync_existing_images(ibs, hit_info, species, DRY):
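"""
Sync properties for images that already exist in the IA database.

Expected inputs (inferred from the caller, sync_wildbook): `ibs` is an open
wbia controller, `hit_info` is a ut.ColumnLists of parsed wildbook rows
whose uuids already exist in the database, `species` is the species code to
assign, and `DRY=True` disables all writes.
"""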
logger.info('Syncing existing images')
import numpy as np
# Get info for items already in database
hit_info['gid'] = ibs.get_image_gids_from_uuid(hit_info['uuid'])
hit_images = ibs.images(hit_info['gid'])
# Sync original_uris
logger.info('Checking uris_original')
ia_prop_list = hit_images.uris_original
wb_prop_list = hit_info['img_url']
dirty_flags = []
for ia_prop, wb_prop in zip(ia_prop_list, wb_prop_list):
if not ut.is_listlike(ia_prop) and ut.is_listlike(wb_prop):
# wb had ambiguous values; things are ok if we hit at least one
flag = ia_prop not in wb_prop
else:
flag = ia_prop != wb_prop
dirty_flags.append(flag)
# Use the wildbook urls as original uris
if not any(dirty_flags):
logger.info('...None of the %d original uris need fixing' % len(hit_images))
else:
logger.info(
'...There are %d/%d original uris that need fixing'
% (sum(dirty_flags), len(dirty_flags))
)
dirty_info = hit_info.compress(dirty_flags)
dirty_gids = dirty_info['gid']
dirty_wb_props = dirty_info.map_column(
'img_url', lambda v: ut.ensure_iterable(v)[0]
)
if not DRY:
logger.info('...Fixing %d original uris' % (sum(dirty_flags),))
ibs.set_image_uris_original(dirty_gids, dirty_wb_props, overwrite=True)
else:
logger.info('\n'.join(hit_images.compress(dirty_flags).uris_original))
dirty_info.print(keys='img_url')
# Sync info in annotations
num_annots = np.array(hit_images.num_annotations)
is_empty = num_annots == 0
empty_hit_info = hit_info.compress(is_empty)
empty_hit_images = hit_images.compress(is_empty)
is_single = num_annots == 1
single_hit_info = hit_info.compress(is_single)
single_hit_images = hit_images.compress(is_single)
is_multi = num_annots > 1
multi_hit_info = hit_info.compress(is_multi)
multi_hit_images = hit_images.compress(is_multi)
logger.info('Syncing annot info in images. Checking annots/per/image')
logger.info(' * is_empty = %r' % (is_empty.sum(),))
logger.info(' * is_single = %r' % (is_single.sum(),))
logger.info(' * is_multi = %r' % (is_multi.sum(),))
# We expect an image to be empty if it has junk in the notes
nonjunk_empty = [n != 'junk' for n in empty_hit_images.notes]
logger.info('empty_hit_images = %r' % (empty_hit_images.gids,))
review_empty = empty_hit_images.compress(nonjunk_empty)
logger.info('need to review %r empty images' % (len(review_empty),))
if len(review_empty) > 0:
empty_hit_info.compress(nonjunk_empty).print()
logger.info('Please manually review empty images %r' % (review_empty.gids,))
# We expect multi images to be tagged with primary / secondary
multi_annots = multi_hit_images._annot_groups
nontagged = [
not all(ut.filterflags_general_tags(t, has_any=['primary', 'secondary']))
for t in multi_annots.case_tags
]
if any(nontagged):
logger.info('Reviewing untagged multi images')
review_multi = multi_hit_images.compress(nontagged)
review_annots = review_multi._annot_groups
logger.info('review_multi = %r' % (review_multi.gids,))
multi_hit_info.compress(nontagged).print(
ignore=[
'img_url',
'new_fpath',
'uuid',
'encounter',
'keywords',
'ext',
'suffix',
'fname_tags',
'orig_fname',
'new_fname',
]
)
# Determine the primary annotation
# Check if any are the entire image
logger.info('Attempting to handle automatically')
img_areas = np.prod(review_multi.sizes, axis=1)
bbox_area_rat = [
np.array(areas) / a for areas, a in zip(review_annots.bbox_area, img_areas)
]
flag = False
for areas, g in zip(bbox_area_rat, review_multi):
if np.any(areas > 0.9):
logger.info('PLEASE CHECK image %r' % (g,))
flag = True
assert not flag, 'need to remove bad annots or change code'
# assert False, 'Need to finish/review this code. Stopped in development'
primary_idxs = [np.argsort(areas)[-1:] for areas in bbox_area_rat]
nonprimary_idxs = [np.argsort(areas)[:-1] for areas in bbox_area_rat]
multi_primary_annots = ibs.annots(
ut.flatten(ut.ziptake(review_annots.aids, primary_idxs))
)
multi_secondary_annots = ibs.annots(
ut.flatten(ut.ziptake(review_annots.aids, nonprimary_idxs))
)
if not DRY:
multi_primary_annots.append_tags('primary')
multi_secondary_annots.append_tags('secondary')
# set tags indicating fg/bg status
# Take primary annots from multi-images
isprimary = [
ut.filterflags_general_tags(t, has_any=['primary'])
for t in multi_annots.case_tags
]
assert all(p.sum() == 1 for p in isprimary), 'should only be one primary'
primary_aids = ut.flatten(ut.zipcompress(multi_annots.aids, isprimary))
# Combine primarys and single_hit into single
single_annots = ibs.annots(ut.flatten(single_hit_images.aids) + primary_aids)
single_info = single_hit_info + multi_hit_info
# Do annot syncing
sync_annot_info(ibs, single_annots, single_info, species, DRY)
def sync_annot_info(ibs, single_annots, single_info, species, DRY):
"""
Sync `info` parsed from wildbook into the corresponding IA `annots`.
"""
import numpy as np
# single_info._meta['ignore'] = ['img_url', 'new_fpath', 'uuid', 'encounter']
single_info._meta['ignore'] = ['img_url', 'new_fpath', 'uuid']
# Associate single annots and info using aids (lists should correspond)
single_info['aid'] = single_annots.aids
if True:
# try and set encounter information
key1 = 'encounter'
prop2 = 'static_encounter'
repl2 = ('____', None)
is_set = False
check_annot_disagree(
single_info, single_annots, key1, prop2, repl2, is_set, DRY=DRY
)
# Fix sharks marked as healthy and injured
isbadmark = ut.filterflags_general_tags(
single_annots.case_tags, any_startswith='injur-', has_all='healthy', logic='and'
)
logger.info('Found %r annots marked as both healthy and injured' % (isbadmark.sum(),))
if np.any(isbadmark):
bad_fixme = single_annots.compress(isbadmark)
if not DRY:
bad_fixme.remove_tags('healthy')
# cleaned_tags = ut.modify_tags(
# single_info['tags'],
# regex_map=[
# ('view-.*', None)
# ],
# )
# info_injur_tags = parse_injury_categories(single_info['tags'])
# annot_injur_tags = parse_injury_categories(single_annots.case_tags)
# logger.info(ut.repr4(ut.dict_hist(ut.flatten(cleaned_tags))))
# info_injur_tags = [t if len(t) > 0 else ['healthy'] for t in info_injur_tags]
# info_injur_tags = [ut.setdiff(t, ['healthy']) for t in info_injur_tags]
# annot_injur_tags = [ut.setdiff(t, ['healthy']) for t in annot_injur_tags]
# info_injur_tags = [ut.setdiff(t, ['injur-other']) for t in info_injur_tags]
# annot_injur_tags = [ut.setdiff(t, ['injur-other']) for t in annot_injur_tags]
# Remove redundant aliases on IA side
cleaned_tags = ut.modify_tags(
single_annots.case_tags,
direct_map=[
('nicks', 'injur-nicks'),
('scar', 'injur-scar'),
('trunc', 'injur-trunc'),
('injur-trtruunc', 'injur-trunc'),
],
)
# logger.info(ut.repr4(ut.dict_hist(ut.flatten(cleaned_tags))))
single_info['orig_case_tags'] = ut.lmap(sorted, single_annots.case_tags)
single_info['clean_case_tags'] = ut.lmap(sorted, cleaned_tags)
check_annot_disagree(
single_info,
single_annots,
key1='clean_case_tags',
prop2=None,
repl2=(None, []),
is_set=True,
key2='orig_case_tags',
DRY=DRY,
)
isdirty = [
x != y
for x, y in zip(single_info['orig_case_tags'], single_info['clean_case_tags'])
]
dirty_info = single_info.compress(isdirty)
logger.info('removing redundant info from %r annots' % (len(dirty_info),))
if not DRY:
# dirty_info['orig_case_tags']
ibs.overwrite_annot_case_tags(dirty_info['aid'], dirty_info['clean_case_tags'])
# Setup injury tags
info_injur_tags = get_injured_tags(single_info['tags'])
# annot_injur_tags = get_injured_tags(single_annots.case_tags)
single_info['injur_tags'] = info_injur_tags
# single_info['annot_tags'] = annot_injur_tags
# # Fix injury tags
# key1 = 'injur_tags'
# prop2 = None
# key2 = 'annot_tags'
# repl2 = (None, [])
# is_set = True
# check_annot_disagree(single_info, single_annots, key1, None, repl2, is_set,
# key2=key2, DRY=DRY)
# We should just be able to do a union on the two sets.
isdirty = [
not ut.issubset(t1, t2)
for t1, t2 in zip(single_info['injur_tags'], single_annots.case_tags)
]
new_injurtags = [
ut.setdiff(t1, t2)
for t1, t2 in zip(single_info['injur_tags'], single_annots.case_tags)
]
single_info['new_injurtags'] = new_injurtags
dirty_info = single_info.compress(isdirty)
logger.info('new injur tags' + ut.repr4(ut.dict_hist(ut.flatten(new_injurtags))))
logger.info(
'unioning new injur tags into %r/%r annots' % (sum(isdirty), len(isdirty))
)
if not DRY:
ibs.append_annot_case_tags(dirty_info['aid'], dirty_info['new_injurtags'])
# Append all other keywords as well
cleaned_keywords = ut.modify_tags(single_info['keywords'], direct_map=[('', None)])
single_info['new_keywords'] = [
ut.setdiff(t1, t2) for t1, t2 in zip(cleaned_keywords, single_annots.case_tags)
]
isdirty = [len(t) > 0 for t in single_info['new_keywords']]
dirty_info = single_info.compress(isdirty)
logger.info(
'new_keywords' + ut.repr4(ut.dict_hist(ut.flatten(single_info['new_keywords'])))
)
logger.info(
'unioning new_keywords into %r/%r annots' % (len(dirty_info), len(isdirty))
)
if not DRY:
ibs.append_annot_case_tags(dirty_info['aid'], dirty_info['new_keywords'])
# Check if any other tags need appending
cleaned_tags = ut.modify_tags(single_info['tags'], direct_map=[('', None)])
single_info['new_tags'] = [
ut.setdiff(t1, t2) for t1, t2 in zip(cleaned_tags, single_annots.case_tags)
]
isdirty = [len(t) > 0 for t in single_info['new_tags']]
dirty_info = single_info.compress(isdirty)
logger.info('new_tags' + ut.repr4(ut.dict_hist(ut.flatten(single_info['new_tags']))))
logger.info('unioning new_tags into %r annots' % (len(dirty_info),))
if not DRY:
ibs.append_annot_case_tags(dirty_info['aid'], dirty_info['new_tags'])
# Setup viewpoint
mapping = [
('view-left', 'left'),
('view-right', 'right'),
('view-back', 'back'),
]
single_info['viewpoint_code'] = [None] * len(single_info)
for tag, yaw_text in mapping:
tag_flags = ut.filterflags_general_tags(single_info['tags'], has_any=[tag])
# setup yaw info
for idx in ut.where(tag_flags):
single_info['viewpoint_code'][idx] = yaw_text
# Fix Viewpoint
key1 = 'viewpoint_code'
prop2 = 'viewpoint_code'
repl2 = ('____', None)
is_set = False
check_annot_disagree(single_info, single_annots, key1, prop2, repl2, is_set, DRY=DRY)
# Fix Names
key1 = 'nameid'
prop2 = 'names'
repl2 = ('____', None)
is_set = False
check_annot_disagree(single_info, single_annots, key1, prop2, repl2, is_set, DRY=DRY)
# Fix Species
bad_flags = [s == '____' for s in single_annots.species]
_annots = single_annots.compress(bad_flags)
logger.info('%d/%d annots need fixed species' % (sum(bad_flags), len(single_annots)))
if not DRY:
_annots.species = [species] * len(_annots)
# Move injured/healthy/untagged to appropriate sets
injur_tags = get_injured_tags(single_annots.case_tags, include_healthy=True)
untagged = np.array(ut.lmap(len, injur_tags)) == 0
untagged_annots = single_annots.compress(untagged)
untagged_info = single_info.compress(untagged)
logger.info('%d/%d annots have no tags' % (len(untagged_annots), len(single_annots)))
logger.info(
'Tags from WB imgnames:'
+ ut.repr3(ut.dict_hist(ut.flatten(untagged_info['tags'])))
)
untagged_images = ibs.images(untagged_annots.gids)
# Add healthy tag to anything without an injured tag
if not DRY:
logger.info('Adding healthy tag to sharks not tagged as injured')
ibs.append_annot_case_tags(
untagged_annots.aids, ['healthy'] * len(untagged_annots.aids)
)
if not DRY:
untagged_images.append_to_imageset('Untagged')
categories = get_injur_categories(single_annots.case_tags)
injured_flags = ut.filterflags_general_tags(categories, any_startswith=('injur-'))
healthy_flags = ut.filterflags_general_tags(
single_annots.case_tags, has_any=['healthy']
)
num_have = sum(ut.xor_lists(injured_flags, healthy_flags))
num_miss = len(single_annots) - num_have
logger.info('missing %d annots' % (num_miss,))
injured_annots = single_annots.compress(injured_flags)
injured_images = ibs.images(ut.unique(injured_annots.gids))
healthy_annots = single_annots.compress(healthy_flags)
healthy_images = ibs.images(ut.unique(healthy_annots.gids))
if not DRY:
#
injured_images.remove_from_imageset('Probably Healthy')
healthy_images.remove_from_imageset('Probably Injured')
#
injured_images.append_to_imageset('Probably Injured')
healthy_images.append_to_imageset('Probably Healthy')
def check_annot_disagree(
single_info, single_annots, key1, prop2, repl2, is_set, key2=None, DRY=True
):
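"""
Compare one property between parsed wildbook info and IA annotations,
report disagreements, and copy unambiguous wildbook values into IA.

Argument semantics, as inferred from the call sites in this module: `key1`
is the wildbook column in `single_info`; `prop2` is the corresponding
attribute on `single_annots` (ignored when `key2` names a precomputed
column); `repl2` is a (sentinel, replacement) pair used to null out
placeholders such as '____'; `is_set` switches between scalar equality and
set/subset comparison. Nothing is written unless DRY is False.
"""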
info_prop = single_info[key1]
if key2 is None:
key2 = 'annot_' + prop2
annot_prop = getattr(single_annots, prop2)
annot_prop = [repl2[1] if p == repl2[0] else p for p in annot_prop]
single_info[key2] = annot_prop
else:
annot_prop = single_info[key2]
out_of_sync = [x != y for x, y in zip(info_prop, annot_prop)]
if not is_set:
def isnull(z):
return z is None
else:
def isnull(z):
return len(z) == 0
ia_empty = [isnull(y) and not isnull(x) for x, y in zip(info_prop, annot_prop)]
wb_empty = [isnull(x) and not isnull(y) for x, y in zip(info_prop, annot_prop)]
disagree = [
x != y and not isnull(x) and not isnull(y) for x, y in zip(info_prop, annot_prop)
]
if is_set:
# Like empty, but ia is a pure subset of wb
ia_is_subset = [
d and ut.issubset(y, x) for x, y, d in zip(info_prop, annot_prop, disagree)
]
wb_is_subset = [
d and ut.issubset(x, y) for x, y, d in zip(info_prop, annot_prop, disagree)
]
# There may not be a subset, but there may still be partial overlap
some_isect = [
d and len(ut.isect(y, x)) > 0
for x, y, d in zip(info_prop, annot_prop, disagree)
]
some_isect = ut.and_lists(some_isect, ut.not_list(ia_is_subset))
some_isect = ut.and_lists(some_isect, ut.not_list(wb_is_subset))
# Absolutely no overlap
total_disagree = [
d and len(ut.isect(y, x)) == 0
for x, y, d in zip(info_prop, annot_prop, disagree)
]
logger.info('\n--- RECTIFY prop=%r --- ' % (key1,))
logger.info(
'Prop=%r has %r/%r out of sync items' % (key1, sum(out_of_sync), len(out_of_sync))
)
logger.info(
'WB has populated info for %r/%r %r' % (sum(ia_empty), len(ia_empty), key1)
)
logger.info(
'IA has populated info for %r/%r %r' % (sum(wb_empty), len(wb_empty), key1)
)
logger.info(
'IA and WB disagree on info for %r/%r %r' % (sum(disagree), len(disagree), key1)
)
if is_set:
logger.info(
'IA is subset of WB info for %r/%r %r'
% (sum(ia_is_subset), len(ia_is_subset), key1)
)
logger.info(
'WB is subset of IA info for %r/%r %r'
% (sum(wb_is_subset), len(wb_is_subset), key1)
)
logger.info(
'WB and IA partial overlap info for %r/%r %r'
% (sum(some_isect), len(some_isect), key1)
)
logger.info(
'IA and WB total disagree on info for %r/%r %r'
% (sum(total_disagree), len(total_disagree), key1)
)
sub_info = single_info.take_column('gid', 'aid', key1, key2)
sub_info._meta['ignore'] = []
logger.info('\n--- DISAGREE DETAILS ---')
logger.info('IA POPULATED (updates on IA side?)')
# Do nothing about these
sub_info.compress(wb_empty).print()
logger.info('WB POPULATED: (can give)')
# Pull info from wildbook
sub_info.compress(ia_empty).print()
if is_set:
logger.info('IA subset WB (can give)')
sub_info.compress(ia_is_subset).print()
logger.info('WB subset IA (updates on IA side?)')
sub_info.compress(wb_is_subset).print()
logger.info('SOME OVERLAP')
# Have to manually fix
sub_info.compress(some_isect).print()
logger.info('TOTAL DISAGREE')
# Have to manually fix
sub_info.compress(total_disagree).print()
else:
logger.info('DISAGREE')
# Have to manually fix
sub_info.compress(disagree).print()
# Get which annots need modification.
if is_set:
flags = ut.or_lists(ia_empty, ia_is_subset)
else:
# We can move populated info from wildbook into empty wbia info
flags = ia_empty
new_info = sub_info.compress(flags)
old_annots = single_annots.compress(flags)
if not is_set:
# Ensure that there is no ambiguity (a scalar value means unambiguous)
isscalar_flags = [ut.isscalar(v) for v in new_info[key1]]
n_ambiguous = sum(ut.not_list(isscalar_flags))
assert n_ambiguous == 0
logger.info('There are %d ambiguous properties from wildbook' % (n_ambiguous,))
new_info = new_info.compress(isscalar_flags)
old_annots = old_annots.compress(isscalar_flags)
new_prop = new_info[key1]
if not is_set and len(ut.unique(new_prop)) < 20:
logger.info('new prop hist')
logger.info(ut.repr3(ut.dict_hist(new_prop)))
elif is_set and len(ut.unique(ut.flatten(new_prop))) < 20:
logger.info('new prop hist')
logger.info(ut.repr3(ut.tag_hist(new_prop)))
if not DRY:
logger.info('MODIFYING PROPERTIES')
if len(old_annots) > 0:
if is_set:
assert prop2 is None
assert key1 == 'injur_tags', 'hack is invalid. got={}'.format(key1)
old_annots.append_tags(new_prop)
else:
setattr(old_annots, prop2, new_prop)
else:
logger.info('dryrun')
def get_injur_categories(single_annots, verbose=False):
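"""
Normalize injury-related tags into a small set of canonical categories
(injur-trunc, injur-scar, injur-nicks, injur-bite, injur-gill, injur-other,
healthy). Accepts either an annots object or a plain list of tag lists.

Example:
    >>> # DISABLE_DOCTEST
    >>> # Hypothetical input; raw tags map onto canonical categories
    >>> get_injur_categories([['nicks'], ['healthy'], ['other_injury']])
"""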
# if verbose:
# logger.info('Original tags')
# logger.info(ut.repr3(ut.tag_hist(injur_tags)))
if isinstance(single_annots, list):
case_tags = single_annots
aids = list(range(len(single_annots)))
else:
case_tags = single_annots.case_tags
aids = single_annots.aids
injur_tags = get_injured_tags(case_tags, include_healthy=True)
cleaned_tags, alias_map, unmapped = ut.modify_tags(
injur_tags,
regex_map=[
# Invalid patterns
('^.*' + re.escape('?') + '$', None),
# Truncation
('injur-trunc', 'injur-trunc'),
('trunc', 'injur-trunc'),
# Gill damage
('.*gilldamage.*', 'injur-gill'),
# Other
('injur-unknown', 'injur-other'),
('injur-dead', 'injur-other'),
('other_injury', 'injur-other'),
('injur-damage', 'injur-other'),
('injured', 'injur-other'),
('^injur$', 'injur-other'),
# Nicks
('nicks', 'injur-nicks'),
('injur-nicks-.*', 'injur-nicks'),
('.*bite', 'injur-bite'),
('.*scar', 'injur-scar'),
],
direct_map=[
('injur-trunc', 'injur-trunc'),
('injur-scar', 'injur-scar'),
('injur-other', 'injur-other'),
('injur-nicks', 'injur-nicks'),
('injur-bite', 'injur-bite'),
('healthy', 'healthy'),
],
return_unmapped=True,
return_map=True,
delete_unmapped=True,
)
assert len(unmapped) == 0, 'fixme %r' % (unmapped,)
# Remove injur-other if other known injuries are present
def fixinjur(aid, tags):
tags = sorted(ut.unique(tags))
injured = any([t.startswith('injur-') for t in tags])
if injured:
if 'healthy' in tags:
logger.info(
'shark aid=%r labeled as injured and healthy %r!!!' % (aid, tags)
)
if len(tags) == 0:
return tags
tags = ut.setdiff(tags, ['injur-other'])
if injured and len(tags) == 0:
tags = ['injur-other']
tags = ut.setdiff(tags, ['injur-gill'])
if injured and len(tags) == 0:
tags = ['injur-gill']
# if len(tags) == 1:
# tags = ut.setdiff(tags, ['healthy'])
return tags
cleaned_tags = [fixinjur(aid, tags) for aid, tags in zip(aids, cleaned_tags)]
if verbose:
logger.info(
'mapping: ' + ut.repr3(ut.group_items(alias_map.keys(), alias_map.values()))
)
logger.info('unmapped = %s' % (ut.repr3(unmapped),))
given_tags = set(ut.flatten(injur_tags))
alias_map_used = ut.odict()
for val, key in alias_map.items():
if val in given_tags:
alias_map_used[val] = key
logger.info(
'used_mapping: '
+ ut.repr3(ut.group_items(alias_map_used.keys(), alias_map_used.values()))
)
logger.info('Cleaned tags')
hist = ut.tag_hist(cleaned_tags)
logger.info(ut.repr3(hist))
# Get tag co-occurrence
logger.info('Co-Occurrence Freq')
co_occur = ut.tag_coocurrence(cleaned_tags)
logger.info(ut.repr3(co_occur))
logger.info('Co-Occurrence Percent')
co_occur_percent = ut.odict(
[
(keys, [100 * val / hist[k] for k in keys])
for keys, val in co_occur.items()
]
)
logger.info(ut.repr3(co_occur_percent, precision=2, nl=1))
# other_annots = single_annots.compress(ut.filterflags_general_tags(cleaned_tags, has_any=['injur-other']))
# logger.info('other_annots.case_tags = %s' % (ut.repr4(list(zip(other_annots.gids, other_annots.aids, other_annots.case_tags)), nl=1),))
return cleaned_tags
def add_new_images(ibs, miss_info, species):
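"""
Add parsed wildbook images that are missing from the IA database, group
them into timestamped imagesets, and create one annotation per image
(using the WS_ALL localizer when available, otherwise a full-image
bounding box). Returns the list of new annotation rowids. (Summary
inferred from the body; `miss_info` must have unambiguous filenames.)
"""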
import numpy as np
isambiguous = miss_info.map_column('new_fpath', ut.isiterable)
assert not any(isambiguous), 'Cannot add ambiguous filenames'
# Add images to IA to get a gid
gid_list = ibs.add_images(miss_info['new_fpath'])
miss_info['gid'] = gid_list
# Check to see if adding any images failed
failed_flags = ut.flag_None_items(miss_info['gid'])
logger.info('Failed to add %s images' % (sum(failed_flags),))
passed_flags = ut.not_list(failed_flags)
miss_info = miss_info.compress(passed_flags)
ut.assert_all_not_None(miss_info['gid'])
# ibs.get_image_uris_original(clist['gid'])
assert (
len(ut.find_duplicate_items(miss_info['gid'])) == 0
), 'duplicates should have already been sorted out'
# Just choose one of the urls if any are ambiguous
orig_urls = miss_info.map_column('img_url', lambda v: ut.ensure_iterable(v)[0])
ibs.set_image_uris_original(miss_info['gid'], orig_urls, overwrite=True)
logger.info('Add new images to temporary imagesets')
images_new = ibs.images(miss_info['gid'])
new_imgsettext = 'New Images ' + ut.get_timestamp()
images_new.append_to_imageset(new_imgsettext)
injured_keywords = get_injured_tags(miss_info['tags'])
hasinjur_kw = ut.lmap(bool, injured_keywords)
images_new.compress(hasinjur_kw).append_to_imageset(new_imgsettext + ' Injur')
images_new.compress(ut.not_list(hasinjur_kw)).append_to_imageset(
new_imgsettext + ' Healthy'
)
verbose = True
if verbose:
other_keywords = get_injured_tags(miss_info['tags'], invert=True)
logger.info('Added %r new images' % (len(miss_info)))
logger.info('Of these, %r images had injured tags' % (sum(hasinjur_kw)))
logger.info(
'Of these, %r images had other tags' % (sum(ut.lmap(bool, other_keywords)))
)
logger.info(
'Of these, %r images had no injured tags'
% (len(miss_info) - sum(ut.lmap(bool, injured_keywords)))
)
# injured_keyhist = ut.dict_hist(ut.flatten(injured_keywords), ordered=True)
# other_keyhist = ut.dict_hist(ut.flatten(other_keywords), ordered=True)
# logger.info('')
# logger.info('Injured Keyword histogram:\n' + ', '.join(
# ['*%s*: %s' % (k, v) for k, v in injured_keyhist.items()][::-1]))
# logger.info('')
# logger.info('Other Keyword histogram:\n' + ', '.join(
# ['*%s*: %s' % (k, v) for k, v in other_keyhist.items()][::-1]))
is_empty_annots = np.array(images_new.num_annotations) == 0
# Add annotations to images
empty_new_info = miss_info.compress(is_empty_annots)
empty_new_images = images_new.compress(is_empty_annots)
# DETECT ANNOTATIONS ON NEW IMAGES
if ibs.dbname == 'WS_ALL':
# In the best case we have a detector
config = {
'algo': 'yolo',
'sensitivity': 0.2,
'config_filepath': ut.truepath(
'~/work/WS_ALL/localizer_backup/detect.yolo.2.cfg'
),
'weight_filepath': ut.truepath(
'~/work/WS_ALL/localizer_backup/detect.yolo.2.39000.weights'
),
'class_filepath': ut.truepath(
'~/work/WS_ALL/localizer_backup/detect.yolo.2.cfg.classes'
),
}
depc = ibs.depc_image
images = ibs.images(empty_new_images.gids)
images = images.compress([ext_ not in ['.gif'] for ext_ in images.exts])
gid_list = images.gids
# result is a tuple: (score, bbox_list, theta_list, conf_list, class_list)
results_list = depc.get_property(
'localizations', gid_list, None, config=config
) # NOQA
logger.info('Finished running localizations')
results_list2 = []
multi_gids = []
failed_gids = []
for gid, res in zip(gid_list, results_list):
score, bbox_list, theta_list, conf_list, class_list = res
if len(bbox_list) == 0:
failed_gids.append(gid)
elif len(bbox_list) == 1:
results_list2.append((gid, bbox_list, theta_list))
elif len(bbox_list) > 1:
# Keep only the highest-confidence bounding box for this image.
multi_gids.append(gid)
idx = conf_list.argmax()
res2 = (gid, bbox_list[idx : idx + 1], theta_list[idx : idx + 1])
results_list2.append(res2)
logger.info('%d/%d have localizations' % (len(results_list2), len(results_list)))
logger.info(
'%d/%d are missing localizations' % (len(failed_gids), len(results_list))
)
logger.info(
'%d/%d had multiple localizations' % (len(multi_gids), len(results_list))
)
# Add these to an imageset for fixing
ibs.images(failed_gids).append_to_imageset('NoLocs' + new_imgsettext)
ibs.images(multi_gids).append_to_imageset('MultiLocs' + new_imgsettext)
# Reorder empty_info to be aligned with results
localized_imgs = ibs.images(ut.take_column(results_list2, 0))
empty_new_info_ = empty_new_info.loc_by_key('gid', localized_imgs.gids)
assert all(
[len(a) == 0 for a in localized_imgs.aids]
), 'no annots should be made yet'
# Override old bboxes
annot_gids = localized_imgs.gids
annot_bboxes = np.array(ut.take_column(results_list2, 1))[:, 0, :]
annot_thetas = np.array(ut.take_column(results_list2, 2))[:, 0]
# Fix any ambiguities for name
annot_names = empty_new_info_.map_column(
'nameid', lambda v: ut.ensure_iterable(v)[0]
)
annot_names = ut.replace_nones(annot_names, ibs.const.UNKNOWN)
# annot_names = empty_new_info_['nameid']
annot_species = [species] * len(localized_imgs)
else:
# Make a single annotation for each image in the worst case
annot_gids = empty_new_images.gids
annot_bboxes = [(1, 1, w - 2, h - 2) for w, h in empty_new_images.sizes]
annot_thetas = [0] * len(annot_gids)
annot_names = empty_new_info.loc_by_key('gid', annot_gids)['nameid']
annot_names = ut.replace_nones(annot_names, ibs.const.UNKNOWN)
annot_species = [species] * len(annot_gids)
aid_list = ibs.add_annots(
annot_gids,
bbox_list=annot_bboxes,
theta_list=annot_thetas,
name_list=annot_names,
species_list=annot_species,
)
logger.info('Finished adding new info')
return aid_list
def _needs_redownload(fpath, seconds_thresh):
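"""
Return True if `fpath` does not exist or was last modified more than
`seconds_thresh` seconds ago.

Example:
    >>> # DISABLE_DOCTEST
    >>> # Hypothetical cache file; redownload if older than ~30 days
    >>> _needs_redownload('cached.xml', 60 * 60 * 24 * 30)
"""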
if exists(fpath):
file_info = ut.get_file_info(fpath)
dt = ut.parse_timestamp(file_info['last_modified'], zone='UTC')
# Age of the cached file; redownload when it is older than the threshold
delta = ut.utcnow_tz() - dt
redownload = delta.total_seconds() > seconds_thresh
else:
redownload = True
return redownload
def parse_whaleshark_org():
"""
Read the list of all images from whaleshark.org.
Combines the old XML listing with the newer keyword-based results.
>>> from wbia.scripts.getshark import * # NOQA
"""
from wbia.scripts import getshark
parsed1 = getshark.parse_whaleshark_org_old()
# Also parse using the keyword method
parsed2 = getshark.parse_whaleshark_org_keywords()
logger.info('Parsed %d urls from XML jsp' % (len(parsed1),))
logger.info('Parsed %d urls from keywords' % (len(parsed2),))
# Apply keywords to existing images
# raise NotImplementedError('suffix is now unreliable for comparing encounters')
# Use suffix as a key to create a merger mapping between indices
logger.info('Merging keyword and XML jsp results')
suffix_to_idx1 = ut.make_index_lookup(parsed1['suffix'])
suffix_to_idx2 = ut.make_index_lookup(parsed2['suffix'])
idx1_to_idx2 = ut.dict_take(suffix_to_idx2, parsed1['suffix'], None)
idx2_to_idx1 = ut.dict_take(suffix_to_idx1, parsed2['suffix'], None)
# Find the items that are unique to each set
unmatched_idx1 = ut.where(ut.flag_None_items(idx1_to_idx2))
unmatched_idx2 = ut.where(ut.flag_None_items(idx2_to_idx1))
logger.info('There are %d unique entries in the XML results' % (len(unmatched_idx1),))
logger.info('There are %d unique entries in the jsp results' % (len(unmatched_idx2),))
# nonmatching1 = parsed1.take(unmatched_idx1)
# nonmatching2 = parsed2.take(unmatched_idx2)
# Find the items that are common between both sets
match_idx1 = ut.filter_Nones(idx2_to_idx1)
match_idx2 = ut.filter_Nones(idx1_to_idx2)
# matching1 = parsed1.take(match_idx1)
# matching2 = parsed2.take(match_idx2)
assert len(match_idx1) == len(match_idx2)
logger.info('There are %d items in common' % (len(match_idx1),))
# Make columns agree between parsed1 and parsed2
del parsed1['localid']
parsed1['uuid'] = [None] * len(parsed1)
parsed1['keywords'] = [[] for _ in range(len(parsed1))]
ut.setdiff(parsed2.keys(), parsed1.keys())
ut.setdiff(parsed1.keys(), parsed2.keys())
parsed = parsed2 + parsed1
parsed.cast_column('keywords', ut.oset)
parsed.cast_column('new_fname', ut.ensure_iterable)
parsed.cast_column('img_url', ut.ensure_iterable)
parsed.cast_column('encounter', ut.ensure_iterable)
parsed = parsed.merge_rows('suffix', merge_scalars=False)
parsed.cast_column('keywords', list)
parsed.cast_column('new_fname', lambda v: v[0])
parsed.cast_column('img_url', lambda v: v[0])
parsed.cast_column('encounter', lambda v: v[0])
if True:
parsed._meta['ignore'] = ['new_fname', 'img_url', 'suffix']
parsed.print()
# nonmatching1['nameid'] = [None] * len(nonmatching1)
# nonmatching1['localid'] = [None] * len(nonmatching1)
# Merge keywords from matching parts in parsed2 into parsed1
# parsed1['keywords'] = [[] for _ in range(len(parsed1))]
# for idx1, keys in zip(match_idx1, matching2['keywords']):
# parsed1['keywords'][idx1].extend(keys)
# parsed = parsed2 + nonmatching1
logger.info('Parsed %d total urls' % (len(parsed),))
return parsed
def parse_whaleshark_org_old():
url = 'http://www.whaleshark.org/listImages.jsp'
parsed1 = parse_wildbook_images(url)
return parsed1
def parse_wildbook(images_url, keyword_url=None):
"""
Read the list of all images from a wildbook instance.
(The keyword-based merging is currently disabled in the body.)
Example:
>>> # DISABLE_DOCTEST
>>> from wbia.scripts.getshark import * # NOQA
>>> url = images_url = 'http://www.mantamatcher.org/listImages.jsp'
Example:
>>> # DISABLE_DOCTEST
>>> images_url = 'http://www.whaleshark.org/listImages.jsp'
>>> keyword_url = 'http://www.whaleshark.org/getKeywordImages.jsp'
"""
from wbia.scripts import getshark
parsed1 = getshark.parse_wildbook_images(images_url)
# Also parse using the keyword method
# parsed2 = getshark.parse_wildbook_keywords(keyword_url)
logger.info('Parsed %d urls from XML jsp' % (len(parsed1),))
# if keyword_url:
# logger.info('Parsed %d urls from keywords' % (len(parsed2),))
# Apply keywords to existing images
# raise NotImplementedError('suffix is now unreliable for comparing encounters')
# Use suffix as a key to create a merger mapping between indices
# logger.info('Merging keyword and XML jsp results')
# suffix_to_idx1 = ut.make_index_lookup(parsed1['suffix'])
# suffix_to_idx2 = ut.make_index_lookup(parsed2['suffix'])
# idx1_to_idx2 = ut.dict_take(suffix_to_idx2, parsed1['suffix'], None)
# idx2_to_idx1 = ut.dict_take(suffix_to_idx1, parsed2['suffix'], None)
# Find the items that are unique to each set
# unmatched_idx1 = ut.where(ut.not_list(idx1_to_idx2))
# unmatched_idx2 = ut.where(ut.not_list(idx2_to_idx1))
# logger.info('There are %d unique entries in the XML results' % (len(unmatched_idx1),))
# logger.info('There are %d unique entries in the jsp results' % (len(unmatched_idx2),))
# nonmatching1 = parsed1.take(unmatched_idx1)
# nonmatching2 = parsed2.take(unmatched_idx2)
# Find the items that are common between both sets
# match_idx1 = ut.filter_Nones(idx2_to_idx1)
# match_idx2 = ut.filter_Nones(idx1_to_idx2)
# assert len(match_idx1) == len(match_idx2)
# logger.info('There are %d items in common' % (len(match_idx1),))
# Make columns agree between parsed1 and parsed2
del parsed1['localid']
parsed1['uuid'] = [None] * len(parsed1)
parsed1['keywords'] = [[] for _ in range(len(parsed1))]
parsed = parsed1
parsed.cast_column('keywords', ut.oset)
parsed.cast_column('new_fname', ut.ensure_iterable)
parsed.cast_column('img_url', ut.ensure_iterable)
parsed.cast_column('encounter', ut.ensure_iterable)
parsed = parsed.merge_rows('suffix', merge_scalars=False)
parsed.cast_column('keywords', list)
parsed.cast_column('new_fname', lambda v: v[0])
parsed.cast_column('img_url', lambda v: v[0])
parsed.cast_column('encounter', lambda v: v[0])
if True:
parsed._meta['ignore'] = ['new_fname', 'img_url', 'suffix']
parsed.print()
# nonmatching1['nameid'] = [None] * len(nonmatching1)
# nonmatching1['localid'] = [None] * len(nonmatching1)
# Merge keywords from matching parts in parsed2 into parsed1
# parsed1['keywords'] = [[] for _ in range(len(parsed1))]
# for idx1, keys in zip(match_idx1, matching2['keywords']):
# parsed1['keywords'][idx1].extend(keys)
# parsed = parsed2 + nonmatching1
logger.info('Parsed %d total urls' % (len(parsed),))
return parsed
def parse_wildbook_images(url):
"""
Example:
>>> # DISABLE_DOCTEST
>>> url = 'www.whaleshark.org/listImages.jsp'
>>> url = images_url = 'http://www.mantamatcher.org/listImages.jsp'
>>> parse_wildbook_images(url)
"""
from xml.dom.minidom import parseString
from wbia.scripts import getshark
number = None
cache_dpath = ut.ensure_app_resource_dir('utool', 'sharkinfo')
cache_fpath = join(cache_dpath, ut.hash_data(url) + '.xml')
logger.info('cache_fpath = {!r}'.format(cache_fpath))
# redownload every 30 days or so
if getshark._needs_redownload(cache_fpath, 60 * 60 * 24 * 30):
XMLdata = ut.url_read_text(url)
ut.writeto(cache_fpath, XMLdata)
else:
XMLdata = ut.readfrom(cache_fpath)
# Parse attributes out of XML
dom = parseString(XMLdata.encode('utf8'))
if number:
maxCount = min(number, len(dom.getElementsByTagName('img')))
else:
maxCount = len(dom.getElementsByTagName('img'))
parsed_info = ut.ddict(list)
logger.info('Reading XML information from %d images...' % maxCount)
shark_elements = dom.getElementsByTagName('shark')
_prog = ut.ProgPartial(bs=True, freq=10)
for shark in _prog(shark_elements, lbl='parsing shark elements'):
localCount = 0
for encounter in shark.getElementsByTagName('encounter'):
for img in encounter.getElementsByTagName('img'):
localCount += 1
img_url = img.getAttribute('href')
ext = splitext(img_url)[1].lower()
nameid = shark.getAttribute('number')
new_fname = '%s-%i%s' % (nameid, localCount, ext)
parsed_info['img_url'].append(img_url)
parsed_info['nameid'].append(nameid)
parsed_info['localid'].append(localCount)
# might be different due to prefix
parsed_info['new_fname'].append(new_fname)
parsed_info['encounter'].append(encounter.getAttribute('number'))
# logger.info('Parsed %i / %i files.' % (len(parsed_info['orig_fname']), maxCount))
if number is not None and len(parsed_info['img_url']) == number:
break
parsed_ = ut.ColumnLists(parsed_info)
logger.info('Parsed %d urls from XML jsp' % (len(parsed_),))
# Fix trivial (all non-keyword entries are the same) duplicates
parsed_.cast_column('localid', lambda x: ut.oset(ut.ensure_iterable(x)))
parsed_.cast_column('new_fname', lambda x: ut.oset(ut.ensure_iterable(x)))
parsed1 = parsed_.merge_rows('img_url', merge_scalars=False)
parsed1.cast_column('localid', lambda x: list(x)[0])
parsed1.cast_column('new_fname', lambda x: list(x)[0])
# Check and rectify for duplicate urls
# unique_urls, idxs = parsed_.group_indicies('img_url')
# toremove = []
# for url, idxs in zip(unique_urls, idxs):
# if len(idxs) == 1:
# continue
# dupinfo = parsed_.take(idxs)
# del dupinfo[['localid', 'new_fname', 'img_url']]
# can_fix = True
# for key, vals in dupinfo.asdict().items():
# if not ut.allsame(vals):
# logger.info(dupinfo.to_csv())
# logger.info(('Duplicate items have different values'))
# # May need to fix a case when annotations happen in WB
# assert False, 'cant have this happen'
# can_fix = False
# if can_fix:
# toremove += idxs[1:]
# logger.info('Removing %d duplicate urls' % (len(toremove),))
# flags = ut.not_list(ut.index_to_boolmask(toremove))
# parsed1 = parsed_.compress(flags)
prefix = commonprefix(parsed1['img_url'])
parsed1['suffix'] = [url_[len(prefix) :] for url_ in parsed1['img_url']]
return parsed1
def parse_whaleshark_org_keywords():
verbose = True
if verbose:
logger.info('[keywords] Parsing whaleshark.org keywords')
# if False:
# key_url = 'http://www.whaleshark.org/getKeywordImages.jsp?indexName=nofilter&maxSize=2'
# #import requests
# #resp = requests.get(url)
# #resp.json()
# # key_url = 'http://www.whaleshark.org/getKeywordImages.jsp?indexName=nofilter'
# # json = cached_json_request(key_url)
# # ut.save_data('nofilterwbquery.pkl', json)
# #url = 'http://www.whaleshark.org/getKeywordImages.jsp?indexName=truncationleftpec&maxSize=2'
# #import requests
# #resp = requests.get(url)
# #resp.json()
from wbia.scripts import getshark
url = 'http://www.whaleshark.org/getKeywordImages.jsp'
cache_dpath = ut.ensure_app_resource_dir('utool', 'sharkinfo3')
def cached_json_request(key_url):
import requests
cache_fpath = join(cache_dpath, 'req_' + ut.hashstr27(key_url) + '.json')
if getshark._needs_redownload(cache_fpath, 60 * 60 * 24 * 3000):
logger.info('Execute request %s' % (key_url,))
resp = requests.get(key_url)
logger.info('Got Response')
assert resp.status_code == 200
dict_ = resp.json()
ut.save_data(cache_fpath, dict_)
else:
dict_ = ut.load_data(cache_fpath)
return dict_
# Read all keywords
keywords = cached_json_request(url)['keywords']
key_list = ut.take_column(keywords, 'indexName')
if verbose:
logger.info('[keywords] Keyword indexName:')
logger.info(ut.indent('\n'.join(sorted(key_list)), '* '))
# Request all images belonging to each keyword
request_results = {}
for key in ut.ProgIter(key_list + ['nofilter'], lbl='reading index', bs=False):
key_url = url + '?indexName={indexName}'.format(indexName=key)
request_results[key] = cached_json_request(key_url)
keyed_images = {}
for key, val in request_results.items():
keyed_images[key] = val['images']
# Flatten nested structure into ColumnList (note this will cause img_url duplicates)
parsed_info2 = ut.ddict(list)
for key, images in keyed_images.items():
for imgdict in images:
parsed_info2['img_url'].append(imgdict['url'])
parsed_info2['encounter'].append(imgdict['correspondingEncounterNumber'])
parsed_info2['nameid'].append(imgdict.get('individualID', None))
parsed_info2['uuid'].append(imgdict['uuid'])
parsed_info2['keywords'].append(imgdict['keywords'])
# parsed_info2['keywords'].append([key])
parsed2_ = ut.ColumnLists(parsed_info2)
# Fix trivial (all non-keyword entries are the same) duplicates
parsed2_.cast_column('keywords', ut.oset)
parsed2 = parsed2_.merge_rows('img_url', merge_scalars=False)
parsed2.cast_column('keywords', list)
assert len(parsed2.get_multis('uuid')) == 0, 'uuids must be unique'
if verbose:
injured_keywords = getshark.get_injured_tags(parsed2['keywords'])
other_keywords = getshark.get_injured_tags(parsed2['keywords'], invert=True)
injured_keyhist = ut.dict_hist(ut.flatten(injured_keywords), ordered=True)
other_keyhist = ut.dict_hist(ut.flatten(other_keywords), ordered=True)
logger.info('Scraped %r images with keywords' % (len(parsed2)))
logger.info(
'Of these, %r images had injured tags'
% (sum(ut.lmap(bool, injured_keywords)))
)
logger.info(
'Of these, %r images had other tags' % (sum(ut.lmap(bool, other_keywords)))
)
logger.info(
'Of these, %r images had no injured tags'
% (len(parsed2) - sum(ut.lmap(bool, injured_keywords)))
)
logger.info('')
logger.info(
'Injured Keyword histogram:\n'
+ ', '.join(['*%s*: %s' % (k, v) for k, v in injured_keyhist.items()][::-1])
)
logger.info('')
logger.info(
'Other Keyword histogram:\n'
+ ', '.join(['*%s*: %s' % (k, v) for k, v in other_keyhist.items()][::-1])
)
# Get tag co-occurrence
logger.info('Injur Keywords Co-Occurrence Freq')
co_occur = ut.tag_coocurrence(injured_keywords)
logger.info(ut.repr3(co_occur))
logger.info('Num co-occurrences: %r' % (sum(co_occur.values())))
logger.info('Injur Keywords Co-Occurrence Percent')
co_occur_percent = ut.odict(
[
(keys, [100 * val / injured_keyhist[k] for k in keys])
for keys, val in co_occur.items()
]
)
logger.info(ut.repr3(co_occur_percent, precision=2, nl=1))
_ = getshark.get_injur_categories(injured_keywords, verbose=True) # NOQA
prefix = commonprefix(parsed2['img_url'])
parsed2['suffix'] = [url_[len(prefix) :] for url_ in parsed2['img_url']]
# Hack off the 'encounters/' prefix so the suffix agrees with parsed1
parsed2['suffix'] = [
url_[len('encounters/') :] if url_.startswith('encounters/') else url_
for url_ in parsed2['suffix']
]
parsed2['new_fname'] = [suffix.replace('/', '--') for suffix in parsed2['suffix']]
assert len(parsed2.get_multis('suffix')) == 0, 'hack invalidated something'
return parsed2
def postprocess_filenames(parsed, download_dir):
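"""
Derive the download path, the original filename (the url relative to the
common url prefix), and the file extension for each parsed row.
"""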
from os.path import commonprefix, basename # NOQA
# Create a new filename
parsed['new_fpath'] = [join(download_dir, _fname) for _fname in parsed['new_fname']]
# Remember the original filename
prefix = commonprefix(parsed['img_url'])
parsed['orig_fname'] = [url_[len(prefix) :] for url_ in parsed['img_url']]
# Parse out the extension
parsed['ext'] = [splitext(_fname)[-1] for _fname in parsed['new_fname']]
return parsed
def postprocess_extfilter(parsed):
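"""
Keep only rows whose file extension is a supported raster format
(.jpg / .jpeg / .png) and report what was removed.
"""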
# Filter based on image type (keep only jpgs)
valid_exts = ['.jpg', '.jpeg', '.png']
# , '.bmp', '.gif']
ext_flags = [ext_.lower() in valid_exts for ext_ in parsed['ext']]
invalid_exts = parsed.compress(ut.not_list(ext_flags))['ext']
parsed = parsed.compress(ext_flags)
num_removed = sum(ut.not_list(ext_flags))
logger.info('Invalid Extensions: ' + ut.repr3(ut.dict_hist(invalid_exts)))
logger.info('Valid Extensions: ' + ut.repr3(ut.dict_hist(parsed['ext'])))
logger.info('Removed %d images based on extensions' % (num_removed,))
return parsed
def postprocess_tags_build(parsed):
# Debug printout (note: 'fname_tags' is only populated a few lines below)
if False:
parsed._meta['ignore'] = [
'ext',
'orig_fname',
'new_fname',
'img_url',
'new_fpath',
'encounter',
'localid',
'suffix',
'nameid',
]
parsed._meta['max_lines_start'] = 30
parsed._meta['max_lines_end'] = 30
parsed.print()
parsed.compress(ut.and_lists(parsed['fname_tags'], parsed['keywords'])).print()
# Parse tags out of the original filenames
from wbia.scripts import getshark
parsed['fname_tags'] = getshark.parse_shark_fname_tags(parsed['orig_fname'])
# Map keyword/fname tags to standard ia tags
tags_list = ut.zipflat(parsed['fname_tags'], parsed['keywords'])
cleaned_tags = ut.modify_tags(
tags_list,
direct_map=[('c429b13e4d232129014d251c74c60011', 'stranding'), ('', None)],
regex_aug=[
('other_injury', 'injur-other'),
('truncation', 'injur-trunc'),
('nicks', 'injur-nicks'),
('scar', 'injur-scar'),
('bite', 'injur-bite'),
],
)
# cleaned_tags = ut.modify_tags(
# cleaned_tags,
# regex_aug=[
# ('injur-', 'injured'),
# ],
# )
parsed['tags'] = cleaned_tags
return parsed
def postprocess_tags_filter(parsed):
tag_flags = ut.filterflags_general_tags(
parsed['tags'],
# has_any=['view-left'],
# none_match=['qual.*', 'view-top', 'part-.*', 'cropped'],
)
if all(tag_flags):
logger.info(
'Tags histogram:'
+ ut.repr3(ut.dict_hist(ut.flatten(parsed['tags']), ordered=True))
)
else:
logger.info(
'Tags before choosing:' + ut.repr3(ut.dict_hist(ut.flatten(parsed['tags'])))
)
parsed = parsed.compress(tag_flags)
logger.info(
'Tags after choosing:' + ut.repr3(ut.dict_hist(ut.flatten(parsed['tags'])))
)
num_removed = sum(ut.not_list(tag_flags))
logger.info('Removed %d images based on tags' % (num_removed,))
return parsed
def download_missing_images(parsed, num=None):
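"""
Download any parsed image urls that do not yet exist on disk using a small
process pool. (Note: in the current implementation `num` only affects the
log message; all missing images are submitted for download.)
"""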
exist_flags = ut.lmap(exists, parsed['new_fpath'])
missing_flags = ut.not_list(exist_flags)
logger.info('nExist = %r / %r' % (sum(exist_flags), len(exist_flags)))
logger.info('nMissing = %r / %r' % (sum(missing_flags), len(exist_flags)))
if any(missing_flags):
missing = parsed.compress(missing_flags)
logger.info('Downloading missing subset')
_iter = list(zip(missing['img_url'], missing['new_fpath']))
if num:
logger.info('Only downloading {}'.format(num))
from concurrent import futures
ex = futures.ProcessPoolExecutor(7)
fs = [
ex.submit(ut.download_url, *args, new=True, verbose=False) for args in _iter
]
for f in ut.ProgIter(
futures.as_completed(fs),
length=len(_iter),
lbl='downloading wildbook images',
):
pass
# import multiprocessing
# pool = multiprocessing.Pool(7)
# res = pool.map(ut.partial(ut.download_url, new=True, verbose=False), _iter)
# gen = ut.util_parallel.generate2(ut.download_url, zip(_iter), new=True, verbose=False)
# for _ in gen:
# pass
# _prog = ut.ProgPartial(bs=True, freq=1)
# count = 0
# for img_url, new_fpath in _prog(_iter, lbl='downloading wildbook images'):
# #url = img_url
# #filename = new_fpath
# #break
# try:
# ut.download_url(img_url, new_fpath, verbose=False, new=True)
# count += 1
# if num is not None and count > num:
# break
# except (ZeroDivisionError, IOError):
# pass
def postprocess_corrupted(parsed_dl):
# Remove corrupted or ill-formatted images
import vtool as vt
logger.info('Checking for corrupted images')
fpaths = parsed_dl['new_fpath']
valid_flags = vt.filterflags_valid_images(fpaths, verbose=2)
parsed_dl = parsed_dl.compress(valid_flags)
return parsed_dl
def postprocess_uuids(parsed_dl):
# Assign uuids based on image content.
# Stride of 1 is what IA uses internally
logger.info('Assigning file based UUID')
_prog = ut.ProgPartial(bs=True, freq=10, adjust=True)
parsed_dl['uuid'] = [
ut.get_file_uuid(fpath_, stride=1)
for fpath_ in _prog(parsed_dl['new_fpath'], lbl='uuid check')
]
return parsed_dl
def postprocess_rectify_duplicates(unmerged):
"""Rectify duplicate uuid information"""
logger.info('Checking for duplicate information')
# Find rows that have unique uuids
singles = unmerged.get_singles('uuid')
logger.info('There are %d unique images that appear once' % (len(singles)))
# Find rows with duplicate uuid entries
multis = unmerged.get_multis('uuid')
logger.info('There are %d images that appear more than once' % (len(multis)))
# Map other attributes to ordered-sets to join them
multi_keys = ut.setdiff(unmerged.keys(), ['uuid'])
multis.cast_column(multi_keys, lambda v: ut.oset(ut.ensure_iterable(v)))
# Combine rows with the same uuid. (other attributes are set unioned)
merged = multis.merge_rows('uuid')
# Cast sets into lists
merged.cast_column(multi_keys, list)
logger.info('There are %d unique images that have duplicates' % (len(merged)))
# Rectify the duplicate information in the multi columns.
# Tags/Keywords are simply unioned, leave them as is
takeall_keys = ['tags', 'keywords', 'fname_tags']
# Names/Encounters/etc should only take one value.
# Try to find one, but take multiple if it is ambiguous
takeone_keys = ut.setdiff(multi_keys, takeall_keys)
for key in takeone_keys:
merged.cast_column(key, lambda v: v if len(v) <= 1 else ut.filter_Nones(v))
merged.cast_column(key, lambda v: v[0] if len(v) == 1 else v)
# We need to at least rectify some of the ambiguous information.
# (ie, like where are we going to store the new image?)
# For this just take the first item in the ambiguous list
mustfix_keys = ['new_fpath', 'new_fname']
isambiguous = ut.or_lists(*merged.map_column(mustfix_keys, ut.isiterable))
for key in mustfix_keys:
merged.cast_column(key, lambda v: ut.ensure_iterable(v)[0])
logger.info('Checking for ambiguous columns')
for key in takeone_keys:
isambiguous = merged.map_column(key, ut.isiterable)
num = sum(isambiguous)
if num > 0:
ut.colorprint('X: key=%s has %r ambiguities!' % (key, num), 'red')
# merged.compress(isambiguous).print(keys=[key, 'suffix'])
else:
ut.colorprint('o: key=%s is unambiguous' % (key,), 'green')
# Take the first item from the columns that should only have one value
# merged.cast_column(takeone_keys, lambda v: v[0])
# Deal with animals with multiple names
# merged.cast_column('nameid', lambda v: v if len(v) <= 1 else ut.filter_Nones(v))
# merged.cast_column('nameid', lambda v: v[0] if len(v) == 1 else v)
# print info
# del parsed_dl[['ext', 'localid', 'orig_fname', 'suffix', 'new_fname', 'keywords']]
if False:
merged._meta['ignore'] = [
'img_url',
'orig_fname',
'suffix',
'new_fpath',
'new_fname',
'uuid',
'ext',
'fname_tags',
'keywords',
]
merged._meta['max_lines_start'] = 30
merged._meta['max_lines_end'] = 30
merged.print()
# Combine and return the rectifyied information
info = singles + merged
logger.info('Merged duplicates into %d truly unique images' % (len(info),))
return info
# def main():
# try:
# opts, args = getopt.getopt(sys.argv[1:], 'f:u:n:h')
# except getopt.GetoptError:
# usage()
# sys.exit(1)
# filename = None
# url = 'www.whaleshark.org/listImages.jsp'
# number = 0
# # Handle command-line arguments
# for opt, arg in opts:
# if opt == '-h':
# usage()
# sys.exit()
# elif opt == '-f':
# filename = arg
# elif opt == '-u':
# url = arg
# elif opt == '-n':
# try:
# number = int(arg)
# except ValueError:
# usage()
# sys.exit()
# # Open the XML file and extract its contents as a DOM object
# if filename:
# XMLdata = ut.readfrom(filename)
# else:
# XMLdata = ut.url_read(url)
# #with open('XMLData.xml', 'w') as file_:
# # file_.write(XMLdata)
# logger.info('Downloading')
# download_sharks(XMLdata, number)
# #download_sharks(XMLdata, number)
# def usage():
# logger.info('Fetches a number of images from the ECOCEAN shark database.')
# logger.info('Options:')
# logger.info(' -f <FILENAME> - Reads XML data from a file, rather than a URL.')
# logger.info(' -u <URL> - Reads XML data from the given URL.')
# logger.info(' -n <NUMBER> - Number of images to read; if omitted, reads all of them.')
# logger.info(' -h - Prints this help text.')
# if __name__ == '__main__':
# main()