#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
from os.path import splitext, join, exists, commonprefix
import utool as ut
import re

(print, rrr, profile) = ut.inject2(__name__, '[getshark]')
logger = logging.getLogger('wbia')


def sync_wildbook():
    """
    MAIN ENTRY POINT

    Synchronizes our wbia database with a wildbook database like
    whaleshark.org

    # cd ~/work/WS_ALL
    python -m wbia.scripts.getshark
    cd /media/raid/raw/WhaleSharks_WB/

    >>> from wbia.scripts.getshark import *  # NOQA
    """
    from wbia.scripts import getshark

    # Prepare the output directory for writing, if it doesn't exist
    if True:
        # Read ALL data from whaleshark.org
        parsed = getshark.parse_whaleshark_org()
        db = 'WS_ALL'
        species = 'whale_shark'
        # images_url = 'http://www.whaleshark.org/listImages.jsp'
        # keyword_url = 'http://www.whaleshark.org/getKeywordImages.jsp'
        download_dir = join('/media/raid/raw/Wildbook/', 'sharkimages')

    if False:
        # Read ALL data from mantamatcher.org
        images_url = 'http://www.mantamatcher.org/listImages.jsp'
        keyword_url = None
        db = 'Mantas'
        species = 'manta_ray'
        parsed = parse_wildbook(images_url, keyword_url)
        download_dir = join('/media/raid/raw/Wildbook/', 'mantas')

    if False:
        # Read matcher images from mantamatcher.org
        images_url = 'http://www.mantamatcher.org/listMatcherImages.jsp'
        keyword_url = None
        db = 'MantaMatcher'
        species = 'manta_ray'
        parsed = parse_wildbook(images_url, keyword_url)
        download_dir = join('/media/raid/raw/Wildbook/', 'MantaMatcher')

    DRY = False
    DRY = True

    ut.ensuredir(download_dir)

    parsed = getshark.postprocess_filenames(parsed, download_dir)
    parsed = getshark.postprocess_extfilter(parsed)
    parsed = getshark.postprocess_tags_build(parsed)
    parsed = getshark.postprocess_tags_filter(parsed)

    # Download images that we don't have yet
    getshark.download_missing_images(parsed)

    if False:
        parsed._meta['ignore'].extend(['fname_tags', 'tags', 'orig_fname'])

    # Change variable name to info now that we downloaded
    parsed_dl = parsed.copy()
    parsed_dl = getshark.postprocess_corrupted(parsed_dl)
    parsed_dl = getshark.postprocess_uuids(parsed_dl)

    # Name change again after time intensive step
    unmerged = parsed_dl.copy()

    # Squash duplicate images
    info = getshark.postprocess_rectify_duplicates(unmerged)

    # Check these images against what currently exists in WS_ALL
    import wbia

    ibs = wbia.opendb(db, allow_newdir=True)
    all_images = ibs.images()
    num_ia_unique = len(set(all_images.uuids) - set(info['uuid']))
    # TODO: Check that all the UUIDs in the IA database are indeed ok

    # Determine which items are in the database
    info_gid_list = ibs.get_image_gids_from_uuid(info['uuid'])
    is_hit = ut.flag_not_None_items(info_gid_list)
    is_miss = ut.flag_None_items(info_gid_list)
    hit_info = info.compress(is_hit)  # NOQA
    miss_info = info.compress(is_miss)  # NOQA
    logger.info('The IA database has %r images' % (len(all_images),))
    logger.info(
        'The IA database has %r/%r images not from this downloaded set'
        % (num_ia_unique, len(all_images))
    )
    logger.info('Have %d/%d parsed images' % (len(hit_info), len(info_gid_list)))
    logger.info('Missing %d/%d parsed images' % (len(miss_info), len(info_gid_list)))

    # Add new info
    if not DRY:
        if len(miss_info):
            add_new_images(ibs, miss_info, species)

    # REFIND Existing
    logger.info('Redoing exist check')
    info_gid_list = ibs.get_image_gids_from_uuid(info['uuid'])
    is_hit = ut.flag_not_None_items(info_gid_list)
    is_miss = ut.flag_None_items(info_gid_list)
    hit_info = info.compress(is_hit)  # NOQA
    miss_info = info.compress(is_miss)  # NOQA
    logger.info('The IA database has %r images' % (len(all_images),))
    logger.info(
        'The IA database has %r/%r images not from this downloaded set'
        % (num_ia_unique, len(all_images))
    )
    logger.info('Have %d/%d parsed images' % (len(hit_info), len(info_gid_list)))
    logger.info('Missing %d/%d parsed images' % (len(miss_info), len(info_gid_list)))

    # Sync existing info
    if True:
        sync_existing_images(ibs, hit_info, species, DRY)


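# Sketch of an interactive dry run (hypothetical session, not part of the
# original module; assumes a configured wbia install and the hard-coded raid
# paths above). With DRY = True nothing is modified:
#
#     >>> from wbia.scripts.getshark import sync_wildbook
#     >>> sync_wildbook()  # prints hit/miss stats and planned changes only

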
def sync_existing_images(ibs, hit_info, species, DRY):
    logger.info('Syncing existing images')
    import numpy as np

    # Get info for items already in database
    hit_info['gid'] = ibs.get_image_gids_from_uuid(hit_info['uuid'])
    hit_images = ibs.images(hit_info['gid'])

    # Sync original_uris
    logger.info('Checking uris_original')
    ia_prop_list = hit_images.uris_original
    wb_prop_list = hit_info['img_url']
    dirty_flags = []
    for ia_prop, wb_prop in zip(ia_prop_list, wb_prop_list):
        if not ut.is_listlike(ia_prop) and ut.is_listlike(wb_prop):
            # wb had ambiguous values; things are ok if we hit at least one
            flag = ia_prop not in wb_prop
        else:
            flag = ia_prop != wb_prop
        dirty_flags.append(flag)

    # Use the wildbook urls as original uris
    if not any(dirty_flags):
        logger.info('...All %d original uris do not need fixing' % len(hit_images))
    else:
        logger.info(
            '...There are %d/%d original uris that need fixing'
            % (sum(dirty_flags), len(dirty_flags))
        )
        dirty_info = hit_info.compress(dirty_flags)
        dirty_gids = dirty_info['gid']
        dirty_wb_props = dirty_info.map_column(
            'img_url', lambda v: ut.ensure_iterable(v)[0]
        )
        if not DRY:
            logger.info('...Fixing %d original uris' % (sum(dirty_flags),))
            ibs.set_image_uris_original(dirty_gids, dirty_wb_props, overwrite=True)
        else:
            logger.info('\n'.join(hit_images.compress(dirty_flags).uris_original))
            dirty_info.print(keys='img_url')

    # Sync info in annotations
    num_annots = np.array(hit_images.num_annotations)

    is_empty = num_annots == 0
    empty_hit_info = hit_info.compress(is_empty)
    empty_hit_images = hit_images.compress(is_empty)

    is_single = num_annots == 1
    single_hit_info = hit_info.compress(is_single)
    single_hit_images = hit_images.compress(is_single)

    is_multi = num_annots > 1
    multi_hit_info = hit_info.compress(is_multi)
    multi_hit_images = hit_images.compress(is_multi)

    logger.info('Syncing annot info in images. Checking annots/per/image')
    logger.info(' * is_empty = %r' % (is_empty.sum(),))
    logger.info(' * is_single = %r' % (is_single.sum(),))
    logger.info(' * is_multi = %r' % (is_multi.sum(),))

    # We expect an image to be empty if it has junk in the notes
    nonjunk_empty = [n != 'junk' for n in empty_hit_images.notes]
    logger.info('empty_hit_images = %r' % (empty_hit_images.gids,))
    review_empty = empty_hit_images.compress(nonjunk_empty)
    logger.info('need to review %r empty images' % (len(review_empty),))
    if len(review_empty) > 0:
        empty_hit_info.compress(nonjunk_empty).print()
        logger.info('Please manually review empty images %r' % (review_empty.gids,))

    # We expect multi images to be tagged with primary / secondary
    multi_annots = multi_hit_images._annot_groups
    nontagged = [
        not all(ut.filterflags_general_tags(t, has_any=['primary', 'secondary']))
        for t in multi_annots.case_tags
    ]
    if any(nontagged):
        logger.info('Reviewing untagged multi images')
        review_multi = multi_hit_images.compress(nontagged)
        review_annots = review_multi._annot_groups
        logger.info('review_multi = %r' % (review_multi.gids,))
        multi_hit_info.compress(nontagged).print(
            ignore=[
                'img_url',
                'new_fpath',
                'uuid',
                'encounter',
                'keywords',
                'ext',
                'suffix',
                'fname_tags',
                'orig_fname',
                'new_fname',
            ]
        )
        # Determine the primary annotation
        # Check if any are the entire image
        logger.info('Attempting to handle automatically')
        img_areas = np.prod(review_multi.sizes, axis=1)
        bbox_area_rat = [
            np.array(areas) / a
            for areas, a in zip(review_annots.bbox_area, img_areas)
        ]
        flag = False
        for areas, g in zip(bbox_area_rat, review_multi):
            if np.any(areas > 0.9):
                logger.info('PLEASE CHECK image %r' % (g,))
                flag = True
        assert not flag, 'need to remove bad annots or change code'
        # assert False, 'Need to finish/review this code. Stopped in development'

        primary_idxs = [np.argsort(areas)[-1:] for areas in bbox_area_rat]
        nonprimary_idxs = [np.argsort(areas)[:-1] for areas in bbox_area_rat]
        multi_primary_annots = ibs.annots(
            ut.flatten(ut.ziptake(review_annots.aids, primary_idxs))
        )
        multi_secondary_annots = ibs.annots(
            ut.flatten(ut.ziptake(review_annots.aids, nonprimary_idxs))
        )
        if not DRY:
            multi_primary_annots.append_tags('primary')
            multi_secondary_annots.append_tags('secondary')

    # set tags indicating fg/bg status
    # Take primary annots from multi-images
    isprimary = [
        ut.filterflags_general_tags(t, has_any=['primary'])
        for t in multi_annots.case_tags
    ]
    assert all(p.sum() == 1 for p in isprimary), 'should only be one primary'
    primary_aids = ut.flatten(ut.zipcompress(multi_annots.aids, isprimary))

    # Combine primarys and single_hit into single
    single_annots = ibs.annots(ut.flatten(single_hit_images.aids) + primary_aids)
    single_info = single_hit_info + multi_hit_info

    # Do annot syncing
    sync_annot_info(ibs, single_annots, single_info, species, DRY)


def sync_annot_info(ibs, single_annots, single_info, species, DRY):
    """
    sync `info` from wildbook into `annots` from IA.
    """
    import numpy as np

    # single_info._meta['ignore'] = ['img_url', 'new_fpath', 'uuid', 'encounter']
    single_info._meta['ignore'] = ['img_url', 'new_fpath', 'uuid']

    # Associate single annots and info using aids (lists should correspond)
    single_info['aid'] = single_annots.aids

    if True:
        # try and set encounter information
        key1 = 'encounter'
        prop2 = 'static_encounter'
        repl2 = ('____', None)
        is_set = False
        check_annot_disagree(
            single_info, single_annots, key1, prop2, repl2, is_set, DRY=DRY
        )

    # Fix sharks marked as healthy and injured
    isbadmark = ut.filterflags_general_tags(
        single_annots.case_tags, any_startswith='injur-', has_all='healthy', logic='and'
    )
    logger.info('Marked %r as both healthy and injured' % (isbadmark.sum()))
    if np.any(isbadmark):
        bad_fixme = single_annots.compress(isbadmark)
        if not DRY:
            bad_fixme.remove_tags('healthy')

    # cleaned_tags = ut.modify_tags(
    #     single_info['tags'],
    #     regex_map=[
    #         ('view-.*', None)
    #     ],
    # )
    # info_injur_tags = get_injured_tags(single_info['tags'])
    # annot_injur_tags = get_injured_tags(single_annots.case_tags)
    # logger.info(ut.repr4(ut.dict_hist(ut.flatten(cleaned_tags))))
    # info_injur_tags = [t if len(t) > 0 else ['healthy'] for t in info_injur_tags]
    # info_injur_tags = [ut.setdiff(t, ['healthy']) for t in info_injur_tags]
    # annot_injur_tags = [ut.setdiff(t, ['healthy']) for t in annot_injur_tags]
    # info_injur_tags = [ut.setdiff(t, ['injur-other']) for t in info_injur_tags]
    # annot_injur_tags = [ut.setdiff(t, ['injur-other']) for t in annot_injur_tags]

    # Remove redundant aliases on IA side
    cleaned_tags = ut.modify_tags(
        single_annots.case_tags,
        direct_map=[
            ('nicks', 'injur-nicks'),
            ('scar', 'injur-scar'),
            ('trunc', 'injur-trunc'),
            ('injur-trtruunc', 'injur-trunc'),
        ],
    )
    # logger.info(ut.repr4(ut.dict_hist(ut.flatten(cleaned_tags))))
    single_info['orig_case_tags'] = ut.lmap(sorted, single_annots.case_tags)
    single_info['clean_case_tags'] = ut.lmap(sorted, cleaned_tags)
    check_annot_disagree(
        single_info,
        single_annots,
        key1='clean_case_tags',
        prop2=None,
        repl2=(None, []),
        is_set=True,
        key2='orig_case_tags',
        DRY=DRY,
    )
    isdirty = [
        x != y
        for x, y in zip(single_info['orig_case_tags'], single_info['clean_case_tags'])
    ]
    dirty_info = single_info.compress(isdirty)
    logger.info('removing redundant info from %r annots' % (len(dirty_info),))
    if not DRY:
        # dirty_info['orig_case_tags']
        ibs.overwrite_annot_case_tags(dirty_info['aid'], dirty_info['clean_case_tags'])

    # Setup injury tags
    info_injur_tags = get_injured_tags(single_info['tags'])
    # annot_injur_tags = get_injured_tags(single_annots.case_tags)
    single_info['injur_tags'] = info_injur_tags
    # single_info['annot_tags'] = annot_injur_tags

    # # Fix injury tags
    # key1 = 'injur_tags'
    # prop2 = None
    # key2 = 'annot_tags'
    # repl2 = (None, [])
    # is_set = True
    # check_annot_disagree(single_info, single_annots, key1, None, repl2, is_set,
    #                      key2=key2, DRY=DRY)

    # We should just be able to do a union on the two sets.
    isdirty = [
        not ut.issubset(t1, t2)
        for t1, t2 in zip(single_info['injur_tags'], single_annots.case_tags)
    ]
    new_injurtags = [
        ut.setdiff(t1, t2)
        for t1, t2 in zip(single_info['injur_tags'], single_annots.case_tags)
    ]
    single_info['new_injurtags'] = new_injurtags
    dirty_info = single_info.compress(isdirty)
    logger.info('new injur tags' + ut.repr4(ut.dict_hist(ut.flatten(new_injurtags))))
    logger.info(
        'unioning new injur tags into %r/%r annots' % (sum(isdirty), len(isdirty))
    )
    if not DRY:
        ibs.append_annot_case_tags(dirty_info['aid'], dirty_info['new_injurtags'])

    # Append all other keywords as well
    cleaned_keywords = ut.modify_tags(single_info['keywords'], direct_map=[('', None)])
    single_info['new_keywords'] = [
        ut.setdiff(t1, t2)
        for t1, t2 in zip(cleaned_keywords, single_annots.case_tags)
    ]
    isdirty = [len(t) > 0 for t in single_info['new_keywords']]
    dirty_info = single_info.compress(isdirty)
    logger.info(
        'new_keywords' + ut.repr4(ut.dict_hist(ut.flatten(single_info['new_keywords'])))
    )
    logger.info(
        'unioning new_keywords into %r/%r annots' % (len(dirty_info), len(isdirty))
    )
    if not DRY:
        ibs.append_annot_case_tags(dirty_info['aid'], dirty_info['new_keywords'])

    # Check if any other tags need appending
    cleaned_tags = ut.modify_tags(single_info['tags'], direct_map=[('', None)])
    single_info['new_tags'] = [
        ut.setdiff(t1, t2)
        for t1, t2 in zip(cleaned_tags, single_annots.case_tags)
    ]
    isdirty = [len(t) > 0 for t in single_info['new_tags']]
    dirty_info = single_info.compress(isdirty)
    logger.info('new_tags' + ut.repr4(ut.dict_hist(ut.flatten(single_info['new_tags']))))
    logger.info('unioning new_tags into %r annots' % (len(dirty_info),))
    if not DRY:
        ibs.append_annot_case_tags(dirty_info['aid'], dirty_info['new_tags'])

    # Setup viewpoint
    mapping = [
        ('view-left', 'left'),
        ('view-right', 'right'),
        ('view-back', 'back'),
    ]
    single_info['viewpoint_code'] = [None] * len(single_info)
    for tag, yaw_text in mapping:
        tag_flags = ut.filterflags_general_tags(single_info['tags'], has_any=[tag])
        # setup yaw info
        for idx in ut.where(tag_flags):
            single_info['viewpoint_code'][idx] = yaw_text

    # Fix Viewpoint
    key1 = 'viewpoint_code'
    prop2 = 'viewpoint_code'
    repl2 = ('____', None)
    is_set = False
    check_annot_disagree(single_info, single_annots, key1, prop2, repl2, is_set, DRY=DRY)

    # Fix Names
    key1 = 'nameid'
    prop2 = 'names'
    repl2 = ('____', None)
    is_set = False
    check_annot_disagree(single_info, single_annots, key1, prop2, repl2, is_set, DRY=DRY)

    # Fix Species
    bad_flags = [s == '____' for s in single_annots.species]
    _annots = single_annots.compress(bad_flags)
    logger.info('%d/%d annots need fixed species' % (sum(bad_flags), len(single_annots)))
    if not DRY:
        _annots.species = [species] * len(_annots)

    # Move injured/healthy/untagged to appropriate sets
    injur_tags = get_injured_tags(single_annots.case_tags, include_healthy=True)
    untagged = np.array(ut.lmap(len, injur_tags)) == 0
    untagged_annots = single_annots.compress(untagged)
    untagged_info = single_info.compress(untagged)
    logger.info('%d/%d annots have no tags' % (len(untagged_annots), len(single_annots)))
    logger.info(
        'Tags from WB imgnames:'
        + ut.repr3(ut.dict_hist(ut.flatten(untagged_info['tags'])))
    )
    untagged_images = ibs.images(untagged_annots.gids)

    # Add healthy tag to anything without an injured tag
    if not DRY:
        logger.info('Adding healthy tag to sharks not tagged as injured')
        ibs.append_annot_case_tags(
            untagged_annots.aids, ['healthy'] * len(untagged_annots.aids)
        )
    if not DRY:
        untagged_images.append_to_imageset('Untagged')

    categories = get_injur_categories(single_annots.case_tags)
    injur_flags = ut.filterflags_general_tags(categories, any_startswith=('injur-'))
    healthy_flags = ut.filterflags_general_tags(
        single_annots.case_tags, has_any=['healthy']
    )
    num_have = sum(ut.xor_lists(injur_flags, healthy_flags))
    num_miss = len(single_annots) - num_have
    logger.info('missing %d annots' % (num_miss,))

    injured_annots = single_annots.compress(injur_flags)
    injured_images = ibs.images(ut.unique(injured_annots.gids))
    healthy_annots = single_annots.compress(healthy_flags)
    healthy_images = ibs.images(ut.unique(healthy_annots.gids))
    if not DRY:
        # injured_images.remove_from_imageset('Probably Healthy')
        healthy_images.remove_from_imageset('Probably Injured')
        # injured_images.append_to_imageset('Probably Injured')
        healthy_images.append_to_imageset('Probably Healthy')


def check_annot_disagree(
    single_info, single_annots, key1, prop2, repl2, is_set, key2=None, DRY=True
):
    info_prop = single_info[key1]
    if key2 is None:
        key2 = 'annot_' + prop2
        annot_prop = getattr(single_annots, prop2)
        annot_prop = [repl2[1] if p == repl2[0] else p for p in annot_prop]
        single_info[key2] = annot_prop
    else:
        annot_prop = single_info[key2]

    out_of_sync = [x != y for x, y in zip(info_prop, annot_prop)]

    if not is_set:

        def isnull(z):
            return z is None

    else:

        def isnull(z):
            return len(z) == 0

    ia_empty = [isnull(y) and not isnull(x) for x, y in zip(info_prop, annot_prop)]
    wb_empty = [isnull(x) and not isnull(y) for x, y in zip(info_prop, annot_prop)]
    disagree = [
        x != y and not isnull(x) and not isnull(y)
        for x, y in zip(info_prop, annot_prop)
    ]
    if is_set:
        # Like empty, but ia is a pure subset of wb
        ia_is_subset = [
            d and ut.issubset(y, x)
            for x, y, d in zip(info_prop, annot_prop, disagree)
        ]
        wb_is_subset = [
            d and ut.issubset(x, y)
            for x, y, d in zip(info_prop, annot_prop, disagree)
        ]
        # There may not be a subset, but there is overlap?
        some_isect = [
            d and len(ut.isect(y, x)) > 0
            for x, y, d in zip(info_prop, annot_prop, disagree)
        ]
        some_isect = ut.and_lists(some_isect, ut.not_list(ia_is_subset))
        some_isect = ut.and_lists(some_isect, ut.not_list(wb_is_subset))
        # Absolutely no overlap
        total_disagree = [
            d and len(ut.isect(y, x)) == 0
            for x, y, d in zip(info_prop, annot_prop, disagree)
        ]

    logger.info('\n--- RECTIFY prop=%r --- ' % (key1,))
    logger.info(
        'Prop=%r has %r/%r out of sync items'
        % (key1, sum(out_of_sync), len(out_of_sync))
    )
    logger.info(
        'WB has populated info for %r/%r %r' % (sum(ia_empty), len(ia_empty), key1)
    )
    logger.info(
        'IA has populated info for %r/%r %r' % (sum(wb_empty), len(wb_empty), key1)
    )
    logger.info(
        'IA and WB disagree on info for %r/%r %r'
        % (sum(disagree), len(disagree), key1)
    )
    if is_set:
        logger.info(
            'IA is subset of WB info for %r/%r %r'
            % (sum(ia_is_subset), len(ia_is_subset), key1)
        )
        logger.info(
            'WB is subset of IA info for %r/%r %r'
            % (sum(wb_is_subset), len(wb_is_subset), key1)
        )
        logger.info(
            'WB and IA partial overlap info for %r/%r %r'
            % (sum(some_isect), len(some_isect), key1)
        )
        logger.info(
            'IA and WB total disagree on info for %r/%r %r'
            % (sum(total_disagree), len(total_disagree), key1)
        )

    sub_info = single_info.take_column('gid', 'aid', key1, key2)
    sub_info._meta['ignore'] = []

    logger.info('\n--- DISAGREE DETAILS ---')
    logger.info('IA POPULATED (updates on IA side?)')
    # Do nothing about these
    sub_info.compress(wb_empty).print()
    logger.info('WB POPULATED: (can give)')
    # Pull info from wildbook
    sub_info.compress(ia_empty).print()
    if is_set:
        logger.info('IA subset WB (can give)')
        sub_info.compress(ia_is_subset).print()
        logger.info('WB subset IA (updates on IA side?)')
        sub_info.compress(wb_is_subset).print()
        logger.info('SOME OVERLAP')
        # Have to manually fix
        sub_info.compress(some_isect).print()
        logger.info('TOTAL DISAGREE')
        # Have to manually fix
        sub_info.compress(total_disagree).print()
    else:
        logger.info('DISAGREE')
        # Have to manually fix
        sub_info.compress(disagree).print()

    # Get which annots need modification.
    if is_set:
        flags = ut.or_lists(ia_empty, ia_is_subset)
    else:
        # We can move populated info from wildbook into empty wbia info
        flags = ia_empty

    new_info = sub_info.compress(flags)
    old_annots = single_annots.compress(flags)

    if not is_set:
        # Ensure that there is no ambiguity (each property must be a scalar)
        isunambiguous = [ut.isscalar(v) for v in new_info[key1]]
        notok = sum(ut.not_list(isunambiguous))
        assert notok == 0
        logger.info('There are %d ambiguous properties from wildbook' % (notok,))
        new_info = new_info.compress(isunambiguous)
        old_annots = old_annots.compress(isunambiguous)

    new_prop = new_info[key1]
    if not is_set and len(ut.unique(new_prop)) < 20:
        logger.info('new prop hist')
        logger.info(ut.repr3(ut.dict_hist(new_prop)))
    elif is_set and len(ut.unique(ut.flatten(new_prop))) < 20:
        logger.info('new prop hist')
        logger.info(ut.repr3(ut.tag_hist(new_prop)))

    if not DRY:
        logger.info('MODIFYING PROPERTIES')
        if len(old_annots) > 0:
            if is_set:
                assert prop2 is None
                assert key1 == 'injur_tags', 'hack is invalid. got={}'.format(key1)
                old_annots.append_tags(new_prop)
            else:
                setattr(old_annots, prop2, new_prop)
    else:
        logger.info('dryrun')


def get_injured_tags(tags_list, include_healthy=False, invert=False):
    """
    Returns only the injury-related tags from each list of tags.

    tags_list = single_info['tags']
    tags_list = single_annots.case_tags
    info_injur_tags = get_injured_tags(single_info['tags'])
    annot_injur_tags = get_injured_tags(single_annots.case_tags)
    """
    injur_patterns = [
        'injur-.*',
        'trunc',
        'nicks',
        'bite',
        'scar',
        '.*damage.*',
        '.*scar',
        '.*bite',
        'other_injury',
        'injured',
        'injur',
    ]
    if include_healthy:
        injur_patterns += ['healthy']
    flags_list = [
        [any([re.match(pat, t) for pat in injur_patterns]) for t in tags]
        for tags in tags_list
    ]
    if invert:
        flags_list = ut.lmap(ut.not_list, flags_list)
    only_injur_tags = ut.zipcompress(tags_list, flags_list)
    return only_injur_tags


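# Example of the pattern matching above (toy tag lists; output follows from
# the regexes, so treat it as a sketch):
#
#     >>> tags_list = [['scar', 'view-left'], ['healthy'], ['bite', 'male']]
#     >>> get_injured_tags(tags_list)
#     [['scar'], [], ['bite']]
#     >>> get_injured_tags(tags_list, invert=True)
#     [['view-left'], ['healthy'], ['male']]

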
def get_injur_categories(single_annots, verbose=False):
    # if verbose:
    #     logger.info('Original tags')
    #     logger.info(ut.repr3(ut.tag_hist(injur_tags)))
    if isinstance(single_annots, list):
        case_tags = single_annots
        aids = list(range(len(single_annots)))
    else:
        case_tags = single_annots.case_tags
        aids = single_annots.aids

    injur_tags = get_injured_tags(case_tags, include_healthy=True)

    cleaned_tags, alias_map, unmapped = ut.modify_tags(
        injur_tags,
        regex_map=[
            # Invalid patterns
            ('^.*' + re.escape('?') + '$', None),
            # Truncation
            ('injur-trunc', 'injur-trunc'),
            ('trunc', 'injur-trunc'),
            # Gill damage
            ('.*gilldamage.*', 'injur-gill'),
            # Other
            ('injur-unknown', 'injur-other'),
            ('injur-dead', 'injur-other'),
            ('other_injury', 'injur-other'),
            ('injur-damage', 'injur-other'),
            ('injured', 'injur-other'),
            ('^injur$', 'injur-other'),
            # Nicks
            ('nicks', 'injur-nicks'),
            ('injur-nicks-.*', 'injur-nicks'),
            ('.*bite', 'injur-bite'),
            ('.*scar', 'injur-scar'),
        ],
        direct_map=[
            ('injur-trunc', 'injur-trunc'),
            ('injur-scar', 'injur-scar'),
            ('injur-other', 'injur-other'),
            ('injur-nicks', 'injur-nicks'),
            ('injur-bite', 'injur-bite'),
            ('healthy', 'healthy'),
        ],
        return_unmapped=True,
        return_map=True,
        delete_unmapped=True,
    )
    assert len(unmapped) == 0, 'fixme %r' % (unmapped,)

    # Remove injur-other if other known injuries are present
    def fixinjur(aid, tags):
        tags = sorted(ut.unique(tags))
        injured = any([t.startswith('injur-') for t in tags])
        if injured:
            if 'healthy' in tags:
                logger.info(
                    'shark aid=%r labeled as injured and healthy %r!!!' % (aid, tags)
                )
        if len(tags) == 0:
            return tags
        tags = ut.setdiff(tags, ['injur-other'])
        if injured and len(tags) == 0:
            tags = ['injur-other']
        tags = ut.setdiff(tags, ['injur-gill'])
        if injured and len(tags) == 0:
            tags = ['injur-gill']
        # if len(tags) == 1:
        #     tags = ut.setdiff(tags, ['healthy'])
        return tags

    cleaned_tags = [fixinjur(aid, tags) for aid, tags in zip(aids, cleaned_tags)]

    if verbose:
        logger.info(
            'mapping: ' + ut.repr3(ut.group_items(alias_map.keys(), alias_map.values()))
        )
        logger.info('unmapped = %s' % (ut.repr3(unmapped),))
        given_tags = set(ut.flatten(injur_tags))
        alias_map_used = ut.odict()
        for val, key in alias_map.items():
            if val in given_tags:
                alias_map_used[val] = key
        logger.info(
            'used_mapping: '
            + ut.repr3(ut.group_items(alias_map_used.keys(), alias_map_used.values()))
        )
        logger.info('Cleaned tags')
        hist = ut.tag_hist(cleaned_tags)
        logger.info(ut.repr3(hist))

        # Get tag co-occurrence
        logger.info('Co-Occurrence Freq')
        co_occur = ut.tag_coocurrence(cleaned_tags)
        logger.info(ut.repr3(co_occur))

        logger.info('Co-Occurrence Percent')
        co_occur_percent = ut.odict(
            [
                (keys, [100 * val / hist[k] for k in keys])
                for keys, val in co_occur.items()
            ]
        )
        logger.info(ut.repr3(co_occur_percent, precision=2, nl=1))

        # other_annots = single_annots.compress(
        #     ut.filterflags_general_tags(cleaned_tags, has_any=['injur-other']))
        # logger.info('other_annots.case_tags = %s' % (ut.repr4(list(zip(
        #     other_annots.gids, other_annots.aids, other_annots.case_tags)), nl=1),))
    return cleaned_tags


def add_new_images(ibs, miss_info, species):
    import numpy as np

    isambiguous = miss_info.map_column('new_fpath', ut.isiterable)
    assert not any(isambiguous), 'Cannot add ambiguous filenames'

    # Add images to IA to get a gid
    gid_list = ibs.add_images(miss_info['new_fpath'])
    miss_info['gid'] = gid_list

    # Check to see if adding any images failed
    failed_flags = ut.flag_None_items(miss_info['gid'])
    logger.info('# failed to add %s images' % (sum(failed_flags),))
    passed_flags = ut.not_list(failed_flags)
    miss_info = miss_info.compress(passed_flags)
    ut.assert_all_not_None(miss_info['gid'])
    # ibs.get_image_uris_original(clist['gid'])
    assert (
        len(ut.find_duplicate_items(miss_info['gid'])) == 0
    ), 'duplicates should have already been sorted out'

    # Just choose one of the urls if any are ambiguous
    orig_urls = miss_info.map_column('img_url', lambda v: ut.ensure_iterable(v)[0])
    ibs.set_image_uris_original(miss_info['gid'], orig_urls, overwrite=True)

    logger.info('Add new images to temporary imagesets')
    images_new = ibs.images(miss_info['gid'])
    new_imgsettext = 'New Images ' + ut.get_timestamp()
    images_new.append_to_imageset(new_imgsettext)

    injured_keywords = get_injured_tags(miss_info['tags'])
    hasinjur_kw = ut.lmap(bool, injured_keywords)
    images_new.compress(hasinjur_kw).append_to_imageset(new_imgsettext + ' Injur')
    images_new.compress(ut.not_list(hasinjur_kw)).append_to_imageset(
        new_imgsettext + ' Healthy'
    )

    verbose = True
    if verbose:
        other_keywords = get_injured_tags(miss_info['tags'], invert=True)
        logger.info('Added %r new images' % (len(miss_info)))
        logger.info('Of these, %r images had injured tags' % (sum(hasinjur_kw)))
        logger.info(
            'Of these, %r images had other tags' % (sum(ut.lmap(bool, other_keywords)))
        )
        logger.info(
            'Of these, %r images had no injured tags'
            % (len(miss_info) - sum(ut.lmap(bool, injured_keywords)))
        )
        # injured_keyhist = ut.dict_hist(ut.flatten(injured_keywords), ordered=True)
        # other_keyhist = ut.dict_hist(ut.flatten(other_keywords), ordered=True)
        # logger.info('')
        # logger.info('Injured Keyword histogram:\n' + ', '.join(
        #     ['*%s*: %s' % (k, v) for k, v in injured_keyhist.items()][::-1]))
        # logger.info('')
        # logger.info('Other Keyword histogram:\n' + ', '.join(
        #     ['*%s*: %s' % (k, v) for k, v in other_keyhist.items()][::-1]))

    is_empty_annots = np.array(images_new.num_annotations) == 0

    # Add annotations to images
    empty_new_info = miss_info.compress(is_empty_annots)
    empty_new_images = images_new.compress(is_empty_annots)

    # DETECT ANNOTATIONS ON NEW IMAGES
    if ibs.dbname == 'WS_ALL':
        # In the best case we have a detector
        config = {
            'algo': 'yolo',
            'sensitivity': 0.2,
            'config_filepath': ut.truepath(
                '~/work/WS_ALL/localizer_backup/detect.yolo.2.cfg'
            ),
            'weight_filepath': ut.truepath(
                '~/work/WS_ALL/localizer_backup/detect.yolo.2.39000.weights'
            ),
            'class_filepath': ut.truepath(
                '~/work/WS_ALL/localizer_backup/detect.yolo.2.cfg.classes'
            ),
        }
        depc = ibs.depc_image

        images = ibs.images(empty_new_images.gids)
        images = images.compress([ext_ not in ['.gif'] for ext_ in images.exts])
        gid_list = images.gids

        # result is a tuple: (score, bbox_list, theta_list, conf_list, class_list)
        results_list = depc.get_property(
            'localizations', gid_list, None, config=config
        )  # NOQA
        logger.info('Finished running localizations')

        results_list2 = []
        multi_gids = []
        failed_gids = []

        for gid, res in zip(gid_list, results_list):
            score, bbox_list, theta_list, conf_list, class_list = res
            if len(bbox_list) == 0:
                failed_gids.append(gid)
            elif len(bbox_list) == 1:
                results_list2.append((gid, bbox_list, theta_list))
            elif len(bbox_list) > 1:
                # Take only a single annotation per bounding box.
                multi_gids.append(gid)
                idx = conf_list.argmax()
                res2 = (gid, bbox_list[idx : idx + 1], theta_list[idx : idx + 1])
                results_list2.append(res2)

        logger.info('%d/%d have localizations' % (len(results_list2), len(results_list)))
        logger.info(
            '%d/%d are missing localizations' % (len(failed_gids), len(results_list))
        )
        logger.info(
            '%d/%d had multiple localizations' % (len(multi_gids), len(results_list))
        )

        # Add these to an imageset for fixing
        ibs.images(failed_gids).append_to_imageset('NoLocs' + new_imgsettext)
        ibs.images(multi_gids).append_to_imageset('MultiLocs' + new_imgsettext)

        # Reorder empty_info to be aligned with results
        localized_imgs = ibs.images(ut.take_column(results_list2, 0))
        empty_new_info_ = empty_new_info.loc_by_key('gid', localized_imgs.gids)
        assert all(
            [len(a) == 0 for a in localized_imgs.aids]
        ), 'no annots should be made yet'

        # Override old bboxes
        annot_gids = localized_imgs.gids
        annot_bboxes = np.array(ut.take_column(results_list2, 1))[:, 0, :]
        annot_thetas = np.array(ut.take_column(results_list2, 2))[:, 0]
        # Fix any ambiguities for name
        annot_names = empty_new_info_.map_column(
            'nameid', lambda v: ut.ensure_iterable(v)[0]
        )
        annot_names = ut.replace_nones(annot_names, ibs.const.UNKNOWN)
        # annot_names = empty_new_info_['nameid']
        annot_species = [species] * len(localized_imgs)
    else:
        # Make a single annotation for each image in the worst case
        annot_gids = empty_new_images.gids
        annot_bboxes = [(1, 1, w - 2, h - 2) for w, h in empty_new_images.sizes]
        annot_thetas = [0] * len(annot_gids)
        annot_names = empty_new_info.loc_by_key('gid', annot_gids)['nameid']
        annot_names = ut.replace_nones(annot_names, ibs.const.UNKNOWN)
        annot_species = [species] * len(annot_gids)

    aid_list = ibs.add_annots(
        annot_gids,
        bbox_list=annot_bboxes,
        theta_list=annot_thetas,
        name_list=annot_names,
        species_list=annot_species,
    )
    logger.info('Finished adding new info')
    return aid_list


def _needs_redownload(fpath, seconds_thresh):
    if exists(fpath):
        file_info = ut.get_file_info(fpath)
        dt = ut.parse_timestamp(file_info['last_modified'], zone='UTC')
        # age of the cached copy
        delta = ut.utcnow_tz() - dt
        redownload = delta.total_seconds() > seconds_thresh
    else:
        redownload = True
    return redownload


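# Example: parse_wildbook_images below uses a 30-day threshold, so the cached
# XML listing is only refetched about once a month (the path here is
# hypothetical):
#
#     >>> _needs_redownload('/tmp/listImages.xml', 60 * 60 * 24 * 30)
#     True  # if the file is missing or more than 30 days old

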
def parse_whaleshark_org():
    """
    Read list of all images from wildbook

    Combines old and new

    >>> from wbia.scripts.getshark import *  # NOQA
    """
    from wbia.scripts import getshark

    parsed1 = getshark.parse_whaleshark_org_old()
    # Also parse using the keyword method
    parsed2 = getshark.parse_whaleshark_org_keywords()

    logger.info('Parsed %d urls from XML jsp' % (len(parsed1),))
    logger.info('Parsed %d urls from keywords' % (len(parsed2),))

    # Apply keywords to existing images
    # raise NotImplementedError('suffix is now unreliable for comparing encounters')
    # Use suffix as a key to create a merger mapping between indices
    logger.info('Merging keyword and XML jsp results')
    suffix_to_idx1 = ut.make_index_lookup(parsed1['suffix'])
    suffix_to_idx2 = ut.make_index_lookup(parsed2['suffix'])
    idx1_to_idx2 = ut.dict_take(suffix_to_idx2, parsed1['suffix'], None)
    idx2_to_idx1 = ut.dict_take(suffix_to_idx1, parsed2['suffix'], None)

    # Find the items that are unique to each set
    unmatched_idx1 = ut.where(ut.not_list(idx1_to_idx2))
    unmatched_idx2 = ut.where(ut.not_list(idx2_to_idx1))
    logger.info('There are %d unique entries in the XML results' % (len(unmatched_idx1),))
    logger.info('There are %d unique entries in the jsp results' % (len(unmatched_idx2),))
    # nonmatching1 = parsed1.take(unmatched_idx1)
    # nonmatching2 = parsed2.take(unmatched_idx2)

    # Find the items that are common between both sets
    match_idx1 = ut.filter_Nones(idx2_to_idx1)
    match_idx2 = ut.filter_Nones(idx1_to_idx2)
    # matching1 = parsed1.take(match_idx1)
    # matching2 = parsed2.take(match_idx2)
    assert len(match_idx1) == len(match_idx2)
    logger.info('There are %d items in common' % (len(match_idx1),))

    # Make columns agree between parsed1 and parsed2
    del parsed1['localid']
    parsed1['uuid'] = [None] * len(parsed1)
    parsed1['keywords'] = [[] for _ in range(len(parsed1))]
    ut.setdiff(parsed2.keys(), parsed1.keys())
    ut.setdiff(parsed1.keys(), parsed2.keys())

    parsed = parsed2 + parsed1
    parsed.cast_column('keywords', ut.oset)
    parsed.cast_column('new_fname', ut.ensure_iterable)
    parsed.cast_column('img_url', ut.ensure_iterable)
    parsed.cast_column('encounter', ut.ensure_iterable)
    parsed = parsed.merge_rows('suffix', merge_scalars=False)
    parsed.cast_column('keywords', list)
    parsed.cast_column('new_fname', lambda v: v[0])
    parsed.cast_column('img_url', lambda v: v[0])
    parsed.cast_column('encounter', lambda v: v[0])

    if True:
        parsed._meta['ignore'] = ['new_fname', 'img_url', 'suffix']
        parsed.print()

    # nonmatching1['nameid'] = [None] * len(nonmatching1)
    # nonmatching1['localid'] = [None] * len(nonmatching1)
    # Merge keywords from matching parts in parsed2 into parsed1
    # parsed1['keywords'] = [[] for _ in range(len(parsed1))]
    # for idx1, keys in zip(match_idx1, matching2['keywords']):
    #     parsed1['keywords'][idx1].extend(keys)
    # parsed = parsed2 + nonmatching1

    logger.info('Parsed %d total urls' % (len(parsed),))
    return parsed


def parse_whaleshark_org_old():
    url = 'www.whaleshark.org/listImages.jsp'
    parsed1 = parse_wildbook_images(url)
    return parsed1


def parse_wildbook(images_url, keyword_url=None):
    """
    Read list of all images from wildbook

    Combines old and new

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.scripts.getshark import *  # NOQA
        >>> url = images_url = 'http://www.mantamatcher.org/listImages.jsp'

    Example:
        >>> # DISABLE_DOCTEST
        >>> images_url = 'http://www.whaleshark.org/listImages.jsp'
        >>> keyword_url = 'http://www.whaleshark.org/getKeywordImages.jsp'
    """
    from wbia.scripts import getshark

    parsed1 = getshark.parse_wildbook_images(images_url)
    # Also parse using the keyword method
    # parsed2 = getshark.parse_wildbook_keywords(keyword_url)
    logger.info('Parsed %d urls from XML jsp' % (len(parsed1),))
    # if keyword_url:
    #     logger.info('Parsed %d urls from keywords' % (len(parsed2),))

    # Apply keywords to existing images
    # raise NotImplementedError('suffix is now unreliable for comparing encounters')
    # Use suffix as a key to create a merger mapping between indices
    # logger.info('Merging keyword and XML jsp results')
    # suffix_to_idx1 = ut.make_index_lookup(parsed1['suffix'])
    # suffix_to_idx2 = ut.make_index_lookup(parsed2['suffix'])
    # idx1_to_idx2 = ut.dict_take(suffix_to_idx2, parsed1['suffix'], None)
    # idx2_to_idx1 = ut.dict_take(suffix_to_idx1, parsed2['suffix'], None)

    # Find the items that are unique to each set
    # unmatched_idx1 = ut.where(ut.not_list(idx1_to_idx2))
    # unmatched_idx2 = ut.where(ut.not_list(idx2_to_idx1))
    # logger.info('There are %d unique entries in the XML results' % (len(unmatched_idx1),))
    # logger.info('There are %d unique entries in the jsp results' % (len(unmatched_idx2),))
    # nonmatching1 = parsed1.take(unmatched_idx1)
    # nonmatching2 = parsed2.take(unmatched_idx2)

    # Find the items that are common between both sets
    # match_idx1 = ut.filter_Nones(idx2_to_idx1)
    # match_idx2 = ut.filter_Nones(idx1_to_idx2)
    # assert len(match_idx1) == len(match_idx2)
    # logger.info('There are %d items in common' % (len(match_idx1),))

    # Make columns agree between parsed1 and parsed2
    del parsed1['localid']
    parsed1['uuid'] = [None] * len(parsed1)
    parsed1['keywords'] = [[] for _ in range(len(parsed1))]

    parsed = parsed1
    parsed.cast_column('keywords', ut.oset)
    parsed.cast_column('new_fname', ut.ensure_iterable)
    parsed.cast_column('img_url', ut.ensure_iterable)
    parsed.cast_column('encounter', ut.ensure_iterable)
    parsed = parsed.merge_rows('suffix', merge_scalars=False)
    parsed.cast_column('keywords', list)
    parsed.cast_column('new_fname', lambda v: v[0])
    parsed.cast_column('img_url', lambda v: v[0])
    parsed.cast_column('encounter', lambda v: v[0])

    if True:
        parsed._meta['ignore'] = ['new_fname', 'img_url', 'suffix']
        parsed.print()

    # nonmatching1['nameid'] = [None] * len(nonmatching1)
    # nonmatching1['localid'] = [None] * len(nonmatching1)
    # Merge keywords from matching parts in parsed2 into parsed1
    # parsed1['keywords'] = [[] for _ in range(len(parsed1))]
    # for idx1, keys in zip(match_idx1, matching2['keywords']):
    #     parsed1['keywords'][idx1].extend(keys)
    # parsed = parsed2 + nonmatching1

    logger.info('Parsed %d total urls' % (len(parsed),))
    return parsed


def parse_wildbook_images(url):
    """
    Example:
        >>> # DISABLE_DOCTEST
        >>> url = 'www.whaleshark.org/listImages.jsp'
        >>> url = images_url = 'http://www.mantamatcher.org/listImages.jsp'
        >>> parse_wildbook_images(url)
    """
    from xml.dom.minidom import parseString
    from wbia.scripts import getshark

    number = None
    cache_dpath = ut.ensure_app_resource_dir('utool', 'sharkinfo')
    cache_fpath = join(cache_dpath, ut.hash_data(url) + '.xml')
    logger.info('cache_fpath = {!r}'.format(cache_fpath))
    # redownload every 30 days or so
    if getshark._needs_redownload(cache_fpath, 60 * 60 * 24 * 30):
        XMLdata = ut.url_read_text(url)
        ut.writeto(cache_fpath, XMLdata)
    else:
        XMLdata = ut.readfrom(cache_fpath)

    # Parse attributes out of XML
    dom = parseString(XMLdata.encode('utf8'))
    if number:
        maxCount = min(number, len(dom.getElementsByTagName('img')))
    else:
        maxCount = len(dom.getElementsByTagName('img'))
    parsed_info = ut.ddict(list)
    logger.info('Reading XML information from %d images...' % maxCount)
    shark_elements = dom.getElementsByTagName('shark')
    _prog = ut.ProgPartial(bs=True, freq=10)
    for shark in _prog(shark_elements, lbl='parsing shark elements'):
        localCount = 0
        for encounter in shark.getElementsByTagName('encounter'):
            for img in encounter.getElementsByTagName('img'):
                localCount += 1
                img_url = img.getAttribute('href')
                ext = splitext(img_url)[1].lower()
                nameid = shark.getAttribute('number')
                new_fname = '%s-%i%s' % (nameid, localCount, ext)
                parsed_info['img_url'].append(img_url)
                parsed_info['nameid'].append(nameid)
                # might be different due to prefix
                parsed_info['localid'].append(localCount)
                parsed_info['new_fname'].append(new_fname)
                parsed_info['encounter'].append(encounter.getAttribute('number'))
                # logger.info('Parsed %i / %i files.' % (len(parsed_info['orig_fname']), maxCount))
                if number is not None and len(parsed_info['orig_fname']) == number:
                    break
    parsed_ = ut.ColumnLists(parsed_info)
    logger.info('Parsed %d urls from XML jsp' % (len(parsed_),))

    # Fix trivial (all non-keyword entries are the same) duplicates
    parsed_.cast_column('localid', lambda x: ut.oset(ut.ensure_iterable(x)))
    parsed_.cast_column('new_fname', lambda x: ut.oset(ut.ensure_iterable(x)))
    parsed1 = parsed_.merge_rows('img_url', merge_scalars=False)
    parsed1.cast_column('localid', lambda x: list(x)[0])
    parsed1.cast_column('new_fname', lambda x: list(x)[0])

    # Check and rectify for duplicate urls
    # unique_urls, idxs = parsed_.group_indicies('img_url')
    # toremove = []
    # for url, idxs in zip(unique_urls, idxs):
    #     if len(idxs) == 1:
    #         continue
    #     dupinfo = parsed_.take(idxs)
    #     del dupinfo[['localid', 'new_fname', 'img_url']]
    #     can_fix = True
    #     for key, vals in dupinfo.asdict().items():
    #         if not ut.allsame(vals):
    #             logger.info(dupinfo.to_csv())
    #             logger.info(('Duplicate items have different values'))
    #             # May need to fix a case when annotations happen in WB
    #             assert False, 'cant have this happen'
    #             can_fix = False
    #     if can_fix:
    #         toremove += idxs[1:]
    # logger.info('Removing %d duplicate urls' % (len(toremove),))
    # flags = ut.not_list(ut.index_to_boolmask(toremove))
    # parsed1 = parsed_.compress(flags)

    prefix = commonprefix(parsed1['img_url'])
    parsed1['suffix'] = [url_[len(prefix):] for url_ in parsed1['img_url']]
    return parsed1


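# The listImages.jsp response is XML shaped roughly like the following; this
# sketch is reconstructed from the parsing loop above, not an official schema:
#
#     <sharks>
#       <shark number="A-001">
#         <encounter number="enc-123">
#           <img href="http://.../A-001/image1.jpg"/>
#         </encounter>
#       </shark>
#     </sharks>

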
def parse_whaleshark_org_keywords():
    verbose = True
    if verbose:
        logger.info('[keywords] Parsing whaleshark.org keywords')

    # if False:
    #     key_url = 'http://www.whaleshark.org/getKeywordImages.jsp?indexName=nofilter&maxSize=2'
    #     # import requests
    #     # resp = requests.get(url)
    #     # resp.json()
    #     # key_url = 'http://www.whaleshark.org/getKeywordImages.jsp?indexName=nofilter'
    #     # json = cached_json_request(key_url)
    #     # ut.save_data('nofilterwbquery.pkl', json)
    #     # url = 'http://www.whaleshark.org/getKeywordImages.jsp?indexName=truncationleftpec&maxSize=2'
    #     # import requests
    #     # resp = requests.get(url)
    #     # resp.json()
    from wbia.scripts import getshark

    url = 'http://www.whaleshark.org/getKeywordImages.jsp'
    cache_dpath = ut.ensure_app_resource_dir('utool', 'sharkinfo3')

    def cached_json_request(key_url):
        import requests

        cache_fpath = join(cache_dpath, 'req_' + ut.hashstr27(key_url) + '.json')
        if getshark._needs_redownload(cache_fpath, 60 * 60 * 24 * 3000):
            logger.info('Execute request %s' % (key_url,))
            resp = requests.get(key_url)
            logger.info('Got Response')
            assert resp.status_code == 200
            dict_ = resp.json()
            ut.save_data(cache_fpath, dict_)
        else:
            dict_ = ut.load_data(cache_fpath)
        return dict_

    # Read all keywords
    keywords = cached_json_request(url)['keywords']
    key_list = ut.take_column(keywords, 'indexName')
    if verbose:
        logger.info('[keywords] Keyword indexName:')
        logger.info(ut.indent('\n'.join(sorted(key_list)), '* '))

    # Request all images belonging to each keyword
    request_results = {}
    for key in ut.ProgIter(key_list + ['nofilter'], lbl='reading index', bs=False):
        key_url = url + '?indexName={indexName}'.format(indexName=key)
        request_results[key] = cached_json_request(key_url)

    keyed_images = {}
    for key, val in request_results.items():
        keyed_images[key] = val['images']

    # Flatten nested structure into ColumnList (note this will cause img_url duplicates)
    parsed_info2 = ut.ddict(list)
    for key, images in keyed_images.items():
        for imgdict in images:
            parsed_info2['img_url'].append(imgdict['url'])
            parsed_info2['encounter'].append(imgdict['correspondingEncounterNumber'])
            parsed_info2['nameid'].append(imgdict.get('individualID', None))
            parsed_info2['uuid'].append(imgdict['uuid'])
            parsed_info2['keywords'].append(imgdict['keywords'])
            # parsed_info2['keywords'].append([key])
    parsed2_ = ut.ColumnLists(parsed_info2)

    # Fix trivial (all non-keyword entries are the same) duplicates
    parsed2_.cast_column('keywords', ut.oset)
    parsed2 = parsed2_.merge_rows('img_url', merge_scalars=False)
    parsed2.cast_column('keywords', list)
    assert len(parsed2.get_multis('uuid')) == 0, 'uuids must be unique'

    if verbose:
        injured_keywords = getshark.get_injured_tags(parsed2['keywords'])
        other_keywords = getshark.get_injured_tags(parsed2['keywords'], invert=True)
        injured_keyhist = ut.dict_hist(ut.flatten(injured_keywords), ordered=True)
        other_keyhist = ut.dict_hist(ut.flatten(other_keywords), ordered=True)
        logger.info('Scraped %r images with keywords' % (len(parsed2)))
        logger.info(
            'Of these, %r images had injured tags'
            % (sum(ut.lmap(bool, injured_keywords)))
        )
        logger.info(
            'Of these, %r images had other tags' % (sum(ut.lmap(bool, other_keywords)))
        )
        logger.info(
            'Of these, %r images had no injured tags'
            % (len(parsed2) - sum(ut.lmap(bool, injured_keywords)))
        )
        logger.info('')
        logger.info(
            'Injured Keyword histogram:\n'
            + ', '.join(['*%s*: %s' % (k, v) for k, v in injured_keyhist.items()][::-1])
        )
        logger.info('')
        logger.info(
            'Other Keyword histogram:\n'
            + ', '.join(['*%s*: %s' % (k, v) for k, v in other_keyhist.items()][::-1])
        )

        # Get tag co-occurrence
        logger.info('Injur Keywords Co-Occurrence Freq')
        co_occur = ut.tag_coocurrence(injured_keywords)
        logger.info(ut.repr3(co_occur))
        logger.info('Num co-occurrences: %r' % (sum(co_occur.values())))

        logger.info('Injur Keywords Co-Occurrence Percent')
        co_occur_percent = ut.odict(
            [
                (keys, [100 * val / injured_keyhist[k] for k in keys])
                for keys, val in co_occur.items()
            ]
        )
        logger.info(ut.repr3(co_occur_percent, precision=2, nl=1))

        _ = getshark.get_injur_categories(injured_keywords, verbose=True)  # NOQA

    prefix = commonprefix(parsed2['img_url'])
    parsed2['suffix'] = [url_[len(prefix):] for url_ in parsed2['img_url']]
    # Hack off the 'encounters/' prefix so it agrees with parsed1
    parsed2['suffix'] = [
        url_[len('encounters/'):] if url_.startswith('encounters/') else url_
        for url_ in parsed2['suffix']
    ]
    parsed2['new_fname'] = [suffix.replace('/', '--') for suffix in parsed2['suffix']]
    assert len(parsed2.get_multis('suffix')) == 0, 'hack invalidated something'
    return parsed2


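# The getKeywordImages.jsp responses are consumed through these JSON fields;
# the shape below is inferred from the access patterns above, not an official
# schema:
#
#     {"keywords": [{"indexName": "truncationleftpec", ...}, ...],
#      "images": [{"url": "...", "correspondingEncounterNumber": "...",
#                  "individualID": "...", "uuid": "...", "keywords": [...]}]}

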
def postprocess_filenames(parsed, download_dir):
    from os.path import commonprefix, basename  # NOQA

    # Create a new filename
    parsed['new_fpath'] = [join(download_dir, _fname) for _fname in parsed['new_fname']]
    # Remember the original filename
    prefix = commonprefix(parsed['img_url'])
    parsed['orig_fname'] = [url_[len(prefix):] for url_ in parsed['img_url']]
    # Parse out the extension
    parsed['ext'] = [splitext(_fname)[-1] for _fname in parsed['new_fname']]
    return parsed


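# Sketch of the prefix stripping above with made-up urls:
#
#     >>> from os.path import commonprefix
#     >>> urls = ['http://x.org/img/a/1.jpg', 'http://x.org/img/b/2.jpg']
#     >>> prefix = commonprefix(urls)       # 'http://x.org/img/'
#     >>> [u[len(prefix):] for u in urls]
#     ['a/1.jpg', 'b/2.jpg']

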
def postprocess_extfilter(parsed):
    # Filter based on image type (keep only jpgs)
    valid_exts = ['.jpg', '.jpeg', '.png']  # , '.bmp', '.gif']
    ext_flags = [ext_.lower() in valid_exts for ext_ in parsed['ext']]
    invalid_exts = parsed.compress(ut.not_list(ext_flags))['ext']
    parsed = parsed.compress(ext_flags)
    num_removed = sum(ut.not_list(ext_flags))
    logger.info('Invalid Extensions: ' + ut.repr3(ut.dict_hist(invalid_exts)))
    logger.info('Valid Extensions: ' + ut.repr3(ut.dict_hist(parsed['ext'])))
    logger.info('Removed %d images based on extensions' % (num_removed,))
    return parsed


def postprocess_tags_build(parsed):
    if False:
        parsed._meta['ignore'] = [
            'ext',
            'orig_fname',
            'new_fname',
            'img_url',
            'new_fpath',
            'encounter',
            'localid',
            'suffix',
            'nameid',
        ]
        parsed._meta['max_lines_start'] = 30
        parsed._meta['max_lines_end'] = 30
        parsed.print()
        parsed.compress(ut.and_lists(parsed['fname_tags'], parsed['keywords'])).print()

    # Filter to only images matching the appropriate tags
    from wbia.scripts import getshark

    parsed['fname_tags'] = getshark.parse_shark_fname_tags(parsed['orig_fname'])

    # Map keyword/fname tags to standard ia tags
    tags_list = ut.zipflat(parsed['fname_tags'], parsed['keywords'])
    cleaned_tags = ut.modify_tags(
        tags_list,
        direct_map=[('c429b13e4d232129014d251c74c60011', 'stranding'), ('', None)],
        regex_aug=[
            ('other_injury', 'injur-other'),
            ('truncation', 'injur-trunc'),
            ('nicks', 'injur-nicks'),
            ('scar', 'injur-scar'),
            ('bite', 'injur-bite'),
        ],
    )
    # cleaned_tags = ut.modify_tags(
    #     cleaned_tags,
    #     regex_aug=[
    #         ('injur-', 'injured'),
    #     ],
    # )
    parsed['tags'] = cleaned_tags
    return parsed


def postprocess_tags_filter(parsed):
    tag_flags = ut.filterflags_general_tags(
        parsed['tags'],
        # has_any=['view-left'],
        # none_match=['qual.*', 'view-top', 'part-.*', 'cropped'],
    )
    if all(tag_flags):
        logger.info(
            'Tags histogram:'
            + ut.repr3(ut.dict_hist(ut.flatten(parsed['tags']), ordered=True))
        )
    else:
        logger.info(
            'Tags before choosing:' + ut.repr3(ut.dict_hist(ut.flatten(parsed['tags'])))
        )
        parsed = parsed.compress(tag_flags)
        logger.info(
            'Tags after choosing:' + ut.repr3(ut.dict_hist(ut.flatten(parsed['tags'])))
        )
    num_removed = sum(ut.not_list(tag_flags))
    logger.info('Removed %d images based on tags' % (num_removed,))
    return parsed


def download_missing_images(parsed, num=None):
    exist_flags = ut.lmap(exists, parsed['new_fpath'])
    missing_flags = ut.not_list(exist_flags)
    logger.info('nExist = %r / %r' % (sum(exist_flags), len(exist_flags)))
    logger.info('nMissing = %r / %r' % (sum(missing_flags), len(exist_flags)))
    if any(missing_flags):
        missing = parsed.compress(missing_flags)
        logger.info('Downloading missing subset')
        _iter = list(zip(missing['img_url'], missing['new_fpath']))
        if num:
            logger.info('Only downloading {}'.format(num))

        from concurrent import futures

        ex = futures.ProcessPoolExecutor(7)
        fs = [
            ex.submit(ut.download_url, *args, new=True, verbose=False)
            for args in _iter
        ]
        for f in ut.ProgIter(
            futures.as_completed(fs),
            length=len(_iter),
            label='downloading wildbook images',
        ):
            pass

        # import multiprocessing
        # pool = multiprocessing.Pool(7)
        # res = pool.map(ut.partial(ut.download_url, new=True, verbose=False), _iter)
        # gen = ut.util_parallel.generate2(
        #     ut.download_url, zip(_iter), new=True, verbose=False)
        # for _ in gen:
        #     pass
        # _prog = ut.ProgPartial(bs=True, freq=1)
        # count = 0
        # for img_url, new_fpath in _prog(_iter, lbl='downloading wildbook images'):
        #     # url = img_url
        #     # filename = new_fpath
        #     # break
        #     try:
        #         ut.download_url(img_url, new_fpath, verbose=False, new=True)
        #         count += 1
        #         if num is not None and count > num:
        #             break
        #     except (ZeroDivisionError, IOError):
        #         pass


def postprocess_corrupted(parsed_dl):
    # Remove corrupted or ill-formatted images
    import vtool as vt

    logger.info('Checking for corrupted images')
    fpaths = parsed_dl['new_fpath']
    valid_flags = vt.filterflags_valid_images(fpaths, verbose=2)
    parsed_dl = parsed_dl.compress(valid_flags)
    return parsed_dl


def postprocess_uuids(parsed_dl):
    # Assign uuids based on image content.
    # Stride of 1 is what IA uses internally
    logger.info('Assigning file based UUID')
    _prog = ut.ProgPartial(bs=True, freq=10, adjust=True)
    parsed_dl['uuid'] = [
        ut.get_file_uuid(fpath_, stride=1)
        for fpath_ in _prog(parsed_dl['new_fpath'], lbl='uuid check')
    ]
    return parsed_dl


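# The UUIDs are derived from file content, so they are stable across runs and
# machines; a sketch with a hypothetical path:
#
#     >>> ut.get_file_uuid('/tmp/shark.jpg', stride=1)  # same file, same uuid

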
def postprocess_rectify_duplicates(unmerged):
    """Rectify duplicate uuid information"""
    logger.info('Checking for duplicate information')
    # Find rows that have unique uuids
    singles = unmerged.get_singles('uuid')
    logger.info('There are %d unique images that appear once' % (len(singles)))
    # Find rows with duplicate uuid entries
    multis = unmerged.get_multis('uuid')
    logger.info('There are %d images that appear more than once' % (len(multis)))

    # Map other attributes to ordered-sets to join them
    multi_keys = ut.setdiff(unmerged.keys(), ['uuid'])
    multis.cast_column(multi_keys, lambda v: ut.oset(ut.ensure_iterable(v)))
    # Combine rows with the same uuid. (other attributes are set unioned)
    merged = multis.merge_rows('uuid')
    # Cast sets into lists
    merged.cast_column(multi_keys, list)
    logger.info('There are %d unique images that have duplicates' % (len(merged)))

    # Rectify the duplicate information in the multi columns.
    # Tags/Keywords are simply unioned; leave them as is
    takeall_keys = ['tags', 'keywords', 'fname_tags']
    # Names/Encounters/etc should only take one value.
    # Try to find one, but take multiple if it is ambiguous
    takeone_keys = ut.setdiff(multi_keys, takeall_keys)
    for key in takeone_keys:
        merged.cast_column(key, lambda v: v if len(v) <= 1 else ut.filter_Nones(v))
        merged.cast_column(key, lambda v: v[0] if len(v) == 1 else v)

    # We need to at least rectify some of the ambiguous information.
    # (ie, like where are we going to store the new image?)
    # For this just take the first item in the ambiguous list
    mustfix_keys = ['new_fpath', 'new_fname']
    isambiguous = ut.or_lists(*merged.map_column(mustfix_keys, ut.isiterable))
    for key in mustfix_keys:
        merged.cast_column(key, lambda v: ut.ensure_iterable(v)[0])

    logger.info('Checking for ambiguous columns')
    for key in takeone_keys:
        isambiguous = merged.map_column(key, ut.isiterable)
        num = sum(isambiguous)
        if num > 0:
            ut.colorprint('X: key=%s has %r ambiguities!' % (key, num), 'red')
            # merged.compress(isambiguous).print(keys=[key, 'suffix'])
        else:
            ut.colorprint('o: key=%s is unambiguous' % (key,), 'green')

    # Take the first item from the columns that should only have one value
    # merged.cast_column(takeone_keys, lambda v: v[0])
    # Deal with animals with multiple names
    # merged.cast_column('nameid', lambda v: v if len(v) <= 1 else ut.filter_Nones(v))
    # merged.cast_column('nameid', lambda v: v[0] if len(v) == 1 else v)

    # print info
    # del parsed_dl[['ext', 'localid', 'orig_fname', 'suffix', 'new_fname', 'keywords']]
    if False:
        merged._meta['ignore'] = [
            'img_url',
            'orig_fname',
            'suffix',
            'new_fpath',
            'new_fname',
            'uuid',
            'ext',
            'fname_tags',
            'keywords',
        ]
        merged._meta['max_lines_start'] = 30
        merged._meta['max_lines_end'] = 30
        merged.print()

    # Combine and return the rectified information
    info = singles + merged
    logger.info('Merged duplicates into %d truly unique images' % (len(info)))
    return info


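# Toy illustration of the squash step (column values are sketches; exact
# behavior depends on utool's ColumnLists implementation):
#
#     >>> unmerged = ut.ColumnLists({
#     ...     'uuid': ['u1', 'u1', 'u2'],
#     ...     'tags': [['a'], ['b'], ['c']],
#     ... })
#     >>> # merge_rows('uuid') combines the two 'u1' rows; their 'tags'
#     >>> # entries are set-unioned into a single list like ['a', 'b']

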
def parse_shark_fname_tags(orig_fname_list, dev=False):
    """
    Parses potential tags from the filename. If dev mode is on, then it
    prints out other potential tags you might add.

    >>> orig_fname_list = parsed['orig_fname']
    >>> dev = True
    >>> tags = parse_shark_fname_tags(orig_fname_list, dev=dev)
    """
    import re

    invalid_tag_patterns = [
        re.escape('-'),
        re.escape('(') + '?\\d*' + re.escape(')') + '?',
        '\\d+-\\d+-\\d+', '\\d+,', '\\d+', 'vi*', 'i*v', 'i+',
        '\\d+th', '\\d+nd', '\\d+rd',
        'remant', 'timnfe', 't', 'e', 'sjl', 'disc', 'dec', 'road', 'easter',
        'western', 'west', 'tn',
        '\\d*ap',
        'whaleshark\\d*', 'shark\\d*', 'whale\\d*',
        'whalesharking', 'sharking', 'whalesharks', 'whales',
        'picture', 'australien', 'australia',
        'nick', 'tim\\d*',
        'imageset', 'holiday', 'visit', 'tour', 'trip', 'pec', 'sv',
        'a', 'b', 'c', 's', 'd', 'h', 'g', 'gender', 'sex',
        'img', 'image', 'pic', 'pics', 'leith', 'trips', 'kings', 'photo',
        'video', 'media', 'fix', 'feeding',
        'nrd', 'nd', 'gen', 'wa', 'nmp', 'bo', 'kd', 'ow', 'ne', 'dsc', 'nwd',
        'mg', 'w', 'mai', 'blue', 'stumpy', 'oea', 'cbe', 'edc', 'knrt',
        'tiws2', 'ando', 'adv', 'str', 'adventure',
        'camera', 'tag', 'id',
        'ws1', 'ws',
        'gulf', 'wally', 'walhai', 'wags',
        'shark[0-9][a-z]', 'shark', 'sharks', 'reef',
        '720x480',
        'nb', 'nrdive', 'tiws', 'exmouth', 'nrdive2', 'ningaloo',
        'ti', 'nwss', '1st', 'exp', 'wsnd', 'cba', '3iwsd', 'c1', 'nwd2',
        's1l1', 's1r1', 'encounter',
        'of', 'and', 'the', 'on', 'to', 'with', 'in', 'up',
        'ws3', 's2', 'tagged', 'from', 'dive', 'untag', 'tagtrace', 'day',
        '\\d*april', '\\d*may', '\\d*july', '\\d*june', 'apr\\d+',
        'ningaloo', 'ningblue\\d*', 'kooling',
    ]

    couldbe_tags = [
        'remnant', 'prop', 'north', 'shot', 'professional', 'red',
        'original', 'measure', 'gender', 'encounter',
    ]

    invalid_tag_patterns += couldbe_tags

    valid_tag_level_set = [
        ['view-left', 'left', 'lhs', 'l', 'leftside'],
        ['view-right', 'right', 'rhs', 'r', 'rightside'],
        ['view-back', 'back'],
        ['view-top', 'top'],
        ['sex-male', 'male', 'm', 'sexm'],
        ['sex-female', 'female', 'f'],
        ['sex-unknown', 'unknown', 'u'],
        ['part-tail', 'tail', 'caudal'],
        ['part-flank', 'side', 'flank'],
        ['part-head', 'head'],
        ['part-pectoral', 'pectoral', 'pec'],
        ['part-dorsal', 'dorsal', 'dorsals'],
        ['part-claspers', 'claspers', 'clasper'],
        ['part-fin', 'fin', 'leftpecfin'],
        ['part-pelvis', 'pelvic'],
        ['part-gill', 'gill'],
        ['cropped', 'crop'],
        ['injur-scar', 'scar', 'scar2', 'scars', 'tailscar'],
        ['injur-bite', 'bite', 'tailbite'],
        ['injur-nicks', 'scratches', 'nicks', 'headnick'],
        ['injur-damage', 'damage'],
        ['injur-trunc', 'trunc'],
        ['injur-other', 'injury'],
        ['notch'],
        ['small'],
        ['qual-resize', 'resize'],
        ['qual-stretched', 'stretched'],
        ['pregnant'],
        ['notpregnant'],
        ['closeup'],
        ['mature'],
        ['ventralid'],
    ]
    cam_tags = [
        ['cam-slr2', 'slr2'],
        ['cam-5m', '5m'],
        ['cam-7m', '7m'],
        ['cam-4m', '4m'],
        ['copy'],
    ]
    invalid_tag_patterns += [re.escape(c) for c in ut.flatten(cam_tags)]
    # valid_tag_level_set += invalid_tag_patterns

    def apply_enum_regex(pat_list):
        enum_endings = [
            '[a-g]',
            '\\d*',
            'i*',
        ]
        expanded_pats = ut.flatten(
            [[pat + end for end in enum_endings] for pat in pat_list]
        )
        return expanded_pats

    def apply_regex_endings(pat_list):
        return [p + '$' for p in pat_list]

    tag_alias_map = {}
    for level_set in valid_tag_level_set:
        main_key = level_set[0]
        for key in level_set:
            tag_alias_map[key] = main_key

    inverse_alias_map = {}
    for level_set in valid_tag_level_set:
        inverse_alias_map[level_set[0]] = level_set

    regex_alias_map = {
        'view-left': apply_regex_endings(
            apply_enum_regex(inverse_alias_map['view-left'])
        ),
        'view-right': apply_regex_endings(
            apply_enum_regex(inverse_alias_map['view-right'])
        ),
    }

    valid_tags = list(inverse_alias_map.keys())

    invalid_tag_patterns = apply_regex_endings(invalid_tag_patterns)

    def parse_all_fname_tags(fname):
        from os.path import basename

        base = basename(splitext(fname)[0])
        # base.replace('(', '')
        # base.replace(')', '')
        _tags = [base]
        for c in ['_', '.', '/', ')', '(', ',']:
            _tags = ut.flatten([t.split(c) for t in _tags])
        _tags = [t.lower() for t in _tags]
        _tags = [tag_alias_map.get(t, t) for t in _tags]
        for key, vals in regex_alias_map.items():
            pat = ut.regex_or(vals)
            _tags = [key if re.match(pat, t) else t for t in _tags]
        pat = ut.regex_or(invalid_tag_patterns)
        _tags = [t for t in _tags if not re.match(pat, t)]
        _tags = ut.unique_ordered(_tags)
        return _tags

    all_img_tag_list = list(map(parse_all_fname_tags, orig_fname_list))

    known_img_tag_list = [
        list(set(tags).intersection(set(valid_tags))) for tags in all_img_tag_list
    ]

    if dev:
        # Help figure out which tags are important
        _parsed_tags = ut.flatten(all_img_tag_list)
        taghist = ut.dict_hist(_parsed_tags)
        taghist = {key: val for key, val in taghist.items() if val > 1}
        unknown_taghist = sorted(
            [(val, key) for key, val in taghist.items() if key not in valid_tags]
        )[::-1]
        known_taghist = sorted(
            [(val, key) for key, val in taghist.items() if key in valid_tags]
        )[::-1]
        logger.info('Unknown')
        logger.info(ut.repr2(unknown_taghist[0:100][::-1], nl=1))
        logger.info('Known')
        logger.info(ut.repr2(known_taghist[0:100][::-1], nl=1))
        logger.info(
            ut.repr2(
                ut.dict_hist(ut.flatten(known_img_tag_list)),
                key_order_metric='val',
                nl=1,
            )
        )

    return known_img_tag_list


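# Example of the filename parsing above (toy filename; resulting tag order may
# vary because the final intersection goes through a set):
#
#     >>> parse_shark_fname_tags(['whaleshark_left2_scar.jpg'])
#     [['view-left', 'injur-scar']]

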
# def main():
#     try:
#         opts, args = getopt.getopt(sys.argv[1:], 'f:u:n:h')
#     except getopt.GetoptError:
#         usage()
#         sys.exit(1)
#     filename = None
#     url = 'www.whaleshark.org/listImages.jsp'
#     number = 0
#     # Handle command-line arguments
#     for opt, arg in opts:
#         if opt == '-h':
#             usage()
#             sys.exit()
#         elif opt == '-f':
#             filename = arg
#         elif opt == '-u':
#             url = arg
#         elif opt == '-n':
#             try:
#                 number = int(arg)
#             except ValueError:
#                 usage()
#                 sys.exit()
#     # Open the XML file and extract its contents as a DOM object
#     if filename:
#         XMLdata = ut.readfrom(filename)
#     else:
#         XMLdata = ut.url_read(url)
#     # with open('XMLData.xml', 'w') as file_:
#     #     file_.write(XMLdata)
#     logger.info('Downloading')
#     download_sharks(XMLdata, number)


# def usage():
#     logger.info('Fetches a number of images from the ECOCEAN shark database.')
#     logger.info('Options:')
#     logger.info('  -f <FILENAME> - Reads XML data from a file, rather than a URL.')
#     logger.info('  -u <URL> - Reads XML data from the given URL.')
#     logger.info('  -n <NUMBER> - Number of images to read; if omitted, reads all of them.')
#     logger.info('  -h - Prints this help text.')


# if __name__ == '__main__':
#     main()