From fd30675124e2ba0fb07fa15af1bec7432cb9f485 Mon Sep 17 00:00:00 2001 From: rr- Date: Fri, 3 Feb 2017 21:20:52 +0100 Subject: [PATCH] server/image-hash: do not depend on image-match While I hold this library in great esteem for its excellent work on implementing the original paper, I have several problems with it: - as of this commit, it (again) has bug fixes unreleased on pip - its code is badly structured - forces OOP and then proceeds to @staticmethod everything - bad class design, parameters are repeated in several places - terrible contract of make_record() and generate_signature() - ambiguous parameters: path vs. image path vs. image content - doesn't adhere to PEP-8 - depends on cairo just to render svg images almost no one uses this library with --- server/requirements.txt | 1 - server/szurubooru/func/image_hash.py | 299 +++++++++++++++--- server/szurubooru/func/posts.py | 6 +- .../szurubooru/tests/func/test_image_hash.py | 6 +- 4 files changed, 267 insertions(+), 45 deletions(-) diff --git a/server/requirements.txt b/server/requirements.txt index c200222d..bef7c14d 100644 --- a/server/requirements.txt +++ b/server/requirements.txt @@ -7,7 +7,6 @@ pytest-cov>=2.2.1 freezegun>=0.3.6 coloredlogs==5.0 pycodestyle>=2.0.0 -image-match>=1.1.0 scipy>=0.18.1 elasticsearch>=5.0.0 elasticsearch-dsl>=5.0.0 diff --git a/server/szurubooru/func/image_hash.py b/server/szurubooru/func/image_hash.py index ae368488..c6fc8403 100644 --- a/server/szurubooru/func/image_hash.py +++ b/server/szurubooru/func/image_hash.py @@ -1,11 +1,13 @@ import logging +from io import BytesIO +from datetime import datetime import elasticsearch import elasticsearch_dsl -import xml.etree -from image_match.elasticsearch_driver import SignatureES +import numpy as np +from skimage.color import rgb2gray +from PIL import Image from szurubooru import config, errors - # pylint: disable=invalid-name logger = logging.getLogger(__name__) es = elasticsearch.Elasticsearch([{ @@ -14,11 +16,190 @@ es = 
elasticsearch.Elasticsearch([{ }]) -def _get_session(): - return SignatureES(es, index=config.config['elasticsearch']['index']) +# Math based on paper from H. Chi Wong, Marshall Bern and David Goldberg +# Math code taken from https://github.com/ascribe/image-match +# (which is licensed under Apache 2 license) + +LOWER_PERCENTILE = 5 +UPPER_PERCENTILE = 95 +IDENTICAL_TOLERANCE = 2 / 255. +DISTANCE_CUTOFF = 0.45 +N_LEVELS = 2 +N = 9 +P = None +SAMPLE_WORDS = 16 +MAX_WORDS = 63 +ES_DOC_TYPE = 'image' +ES_MAX_RESULTS = 100 -def _safe_blanket(default_param_factory): +def _preprocess_image(image_or_path): + img = Image.open(BytesIO(image_or_path)) + img = img.convert('RGB') + return rgb2gray(np.asarray(img, dtype=np.uint8)) + + +def _crop_image(image, lower_percentile, upper_percentile): + rw = np.cumsum(np.sum(np.abs(np.diff(image, axis=1)), axis=1)) + cw = np.cumsum(np.sum(np.abs(np.diff(image, axis=0)), axis=0)) + upper_column_limit = np.searchsorted( + cw, np.percentile(cw, upper_percentile), side='left') + lower_column_limit = np.searchsorted( + cw, np.percentile(cw, lower_percentile), side='right') + upper_row_limit = np.searchsorted( + rw, np.percentile(rw, upper_percentile), side='left') + lower_row_limit = np.searchsorted( + rw, np.percentile(rw, lower_percentile), side='right') + if lower_row_limit > upper_row_limit: + lower_row_limit = int(lower_percentile / 100. * image.shape[0]) + upper_row_limit = int(upper_percentile / 100. * image.shape[0]) + if lower_column_limit > upper_column_limit: + lower_column_limit = int(lower_percentile / 100. * image.shape[1]) + upper_column_limit = int(upper_percentile / 100. * image.shape[1]) + return [ + (lower_row_limit, upper_row_limit), + (lower_column_limit, upper_column_limit)] + + +def _normalize_and_threshold(diff_array, identical_tolerance, n_levels): + mask = np.abs(diff_array) < identical_tolerance + diff_array[mask] = 0. 
+ if np.all(mask): + return None + positive_cutoffs = np.percentile( + diff_array[diff_array > 0.], np.linspace(0, 100, n_levels+1)) + negative_cutoffs = np.percentile( + diff_array[diff_array < 0.], np.linspace(100, 0, n_levels+1)) + for level, interval in enumerate( + positive_cutoffs[i:i+2] + for i in range(positive_cutoffs.shape[0] - 1)): + diff_array[ + (diff_array >= interval[0]) & (diff_array <= interval[1])] = \ + level + 1 + for level, interval in enumerate( + negative_cutoffs[i:i+2] + for i in range(negative_cutoffs.shape[0] - 1)): + diff_array[ + (diff_array <= interval[0]) & (diff_array >= interval[1])] = \ + -(level + 1) + return None + + +def _compute_grid_points(image, n, window=None): + if window is None: + window = [(0, image.shape[0]), (0, image.shape[1])] + x_coords = np.linspace(window[0][0], window[0][1], n + 2, dtype=int)[1:-1] + y_coords = np.linspace(window[1][0], window[1][1], n + 2, dtype=int)[1:-1] + return x_coords, y_coords + + +def _compute_mean_level(image, x_coords, y_coords, p): + if p is None: + p = max([2.0, int(0.5 + min(image.shape) / 20.)]) + avg_grey = np.zeros((x_coords.shape[0], y_coords.shape[0])) + for i, x in enumerate(x_coords): + lower_x_lim = int(max([x - p / 2, 0])) + upper_x_lim = int(min([lower_x_lim + p, image.shape[0]])) + for j, y in enumerate(y_coords): + lower_y_lim = int(max([y - p / 2, 0])) + upper_y_lim = int(min([lower_y_lim + p, image.shape[1]])) + avg_grey[i, j] = np.mean( + image[lower_x_lim:upper_x_lim, lower_y_lim:upper_y_lim]) + return avg_grey + + +def _compute_differentials(grey_level_matrix): + flipped = np.fliplr(grey_level_matrix) + right_neighbors = -np.concatenate(( + np.diff(grey_level_matrix), + np.zeros(grey_level_matrix.shape[0]) + .reshape((grey_level_matrix.shape[0], 1))), axis=1) + down_neighbors = -np.concatenate(( + np.diff(grey_level_matrix, axis=0), + np.zeros(grey_level_matrix.shape[1]) + .reshape((1, grey_level_matrix.shape[1])))) + left_neighbors = -np.concatenate( + 
(right_neighbors[:, -1:], right_neighbors[:, :-1]), axis=1) + up_neighbors = -np.concatenate((down_neighbors[-1:], down_neighbors[:-1])) + diagonals = np.arange( + -grey_level_matrix.shape[0] + 1, grey_level_matrix.shape[0]) + upper_left_neighbors = sum([ + np.diagflat(np.insert(np.diff(np.diag(grey_level_matrix, i)), 0, 0), i) + for i in diagonals]) + upper_right_neighbors = sum([ + np.diagflat(np.insert(np.diff(np.diag(flipped, i)), 0, 0), i) + for i in diagonals]) + lower_right_neighbors = -np.pad( + upper_left_neighbors[1:, 1:], (0, 1), mode='constant') + lower_left_neighbors = -np.pad( + upper_right_neighbors[1:, 1:], (0, 1), mode='constant') + return np.dstack(np.array([ + upper_left_neighbors, + up_neighbors, + np.fliplr(upper_right_neighbors), + left_neighbors, + right_neighbors, + np.fliplr(lower_left_neighbors), + down_neighbors, + lower_right_neighbors])) + + +def _generate_signature(path_or_image): + im_array = _preprocess_image(path_or_image) + image_limits = _crop_image(im_array, + lower_percentile=LOWER_PERCENTILE, + upper_percentile=UPPER_PERCENTILE) + x_coords, y_coords = _compute_grid_points( + im_array, n=N, window=image_limits) + avg_grey = _compute_mean_level(im_array, x_coords, y_coords, p=P) + diff_matrix = _compute_differentials(avg_grey) + _normalize_and_threshold(diff_matrix, + identical_tolerance=IDENTICAL_TOLERANCE, n_levels=N_LEVELS) + return np.ravel(diff_matrix).astype('int8') + + +def _get_words(array, k, n): + word_positions = np.linspace( + 0, array.shape[0], n, endpoint=False).astype('int') + assert k <= array.shape[0] + assert word_positions.shape[0] <= array.shape[0] + words = np.zeros((n, k)).astype('int8') + for i, pos in enumerate(word_positions): + if pos + k <= array.shape[0]: + words[i] = array[pos:pos+k] + else: + temp = array[pos:].copy() + temp.resize(k) + words[i] = temp + _max_contrast(words) + words = _words_to_int(words) + return words + + +def _words_to_int(word_array): + width = word_array.shape[1] + coding_vector 
= 3**np.arange(width) + return np.dot(word_array + 1, coding_vector) + + +def _max_contrast(array): + array[array > 0] = 1 + array[array < 0] = -1 + return None + + +def _normalized_distance(_target_array, _vec, nan_value=1.0): + target_array = _target_array.astype(int) + vec = _vec.astype(int) + topvec = np.linalg.norm(vec - target_array, axis=1) + norm1 = np.linalg.norm(vec, axis=0) + norm2 = np.linalg.norm(target_array, axis=1) + finvec = topvec / (norm1 + norm2) + finvec[np.isnan(finvec)] = nan_value + return finvec + + +def _safety_blanket(default_param_factory): def wrapper_outer(target_function): def wrapper_inner(*args, **kwargs): try: @@ -28,14 +209,13 @@ def _safe_blanket(default_param_factory): # add_image() return default_param_factory() except elasticsearch.exceptions.ElasticsearchException as ex: - logger.warning('Problem with elastic search: %s' % ex) + logger.warning('Problem with elastic search: %s', ex) raise errors.ThirdPartyError( 'Error connecting to elastic search.') - except xml.etree.ElementTree.ParseError as ex: - # image-match issue #60 + except IOError: raise errors.ProcessingError('Not an image.') except Exception as ex: - raise errors.ThirdPartyError('Unknown error (%s).' 
% ex) + raise errors.ThirdPartyError('Unknown error (%s).', ex) return wrapper_inner return wrapper_outer @@ -47,53 +227,96 @@ class Lookalike: self.path = path -@_safe_blanket(lambda: None) +@_safety_blanket(lambda: None) def add_image(path, image_content): - if not path or not image_content: - return - session = _get_session() - session.add_image(path=path, img=image_content, bytestream=True) + assert path + assert image_content + signature = _generate_signature(image_content) + words = _get_words(signature, k=SAMPLE_WORDS, n=MAX_WORDS) + + record = { + 'signature': signature.tolist(), + 'path': path, + 'timestamp': datetime.now(), + } + for i in range(MAX_WORDS): + record['simple_word_' + str(i)] = words[i].tolist() + + es.index( + index=config.config['elasticsearch']['index'], + doc_type=ES_DOC_TYPE, + body=record, + refresh=True) -@_safe_blanket(lambda: None) +@_safety_blanket(lambda: None) def delete_image(path): - if not path: - return - session = _get_session() + assert path es.delete_by_query( - index=session.index, - doc_type=session.doc_type, + index=config.config['elasticsearch']['index'], + doc_type=ES_DOC_TYPE, body={'query': {'term': {'path': path}}}) -@_safe_blanket(lambda: []) +@_safety_blanket(lambda: []) def search_by_image(image_content): + signature = _generate_signature(image_content) + words = _get_words(signature, k=SAMPLE_WORDS, n=MAX_WORDS) + + res = es.search( + index=config.config['elasticsearch']['index'], + doc_type=ES_DOC_TYPE, + body={ + 'query': + { + 'bool': + { + 'should': + [ + {'term': {'simple_word_%d' % i: word.tolist()}} + for i, word in enumerate(words) + ] + } + }, + '_source': {'excludes': ['simple_word_*']}}, + size=ES_MAX_RESULTS, + timeout='10s')['hits']['hits'] + + if len(res) == 0: + return [] + + sigs = np.array([x['_source']['signature'] for x in res]) + dists = _normalized_distance(sigs, np.array(signature)) + + ids = set() ret = [] - session = _get_session() - for result in session.search_image( - 
path=image_content, # sic - bytestream=True): - ret.append(Lookalike( - score=result['score'], - distance=result['dist'], - path=result['path'])) + for item, dist in zip(res, dists): + id = item['_id'] + score = item['_score'] + path = item['_source']['path'] + if id in ids: + continue + ids.add(id) + if dist < DISTANCE_CUTOFF: + ret.append(Lookalike(score=score, distance=dist, path=path)) return ret -@_safe_blanket(lambda: None) +@_safety_blanket(lambda: None) def purge(): - session = _get_session() es.delete_by_query( - index=session.index, - doc_type=session.doc_type, - body={'query': {'match_all': {}}}) + index=config.config['elasticsearch']['index'], + doc_type=ES_DOC_TYPE, + body={'query': {'match_all': {}}}, + refresh=True) -@_safe_blanket(lambda: set()) +@_safety_blanket(lambda: set()) def get_all_paths(): - session = _get_session() search = ( elasticsearch_dsl.Search( - using=es, index=session.index, doc_type=session.doc_type) + using=es, + index=config.config['elasticsearch']['index'], + doc_type=ES_DOC_TYPE) .source(['path'])) return set(h.path for h in search.scan()) diff --git a/server/szurubooru/func/posts.py b/server/szurubooru/func/posts.py index 51d11f31..7c85e8ac 100644 --- a/server/szurubooru/func/posts.py +++ b/server/szurubooru/func/posts.py @@ -268,7 +268,8 @@ def _after_post_update(_mapper, _connection, post): @sqlalchemy.events.event.listens_for(db.Post, 'before_delete') def _before_post_delete(_mapper, _connection, post): - image_hash.delete_image(post.post_id) + if post.post_id: + image_hash.delete_image(post.post_id) def _sync_post_content(post): @@ -279,7 +280,8 @@ def _sync_post_content(post): files.save(get_post_content_path(post), content) delattr(post, '__content') regenerate_thumb = True - if post.type in (db.Post.TYPE_IMAGE, db.Post.TYPE_ANIMATION): + if post.post_id and post.type in ( + db.Post.TYPE_IMAGE, db.Post.TYPE_ANIMATION): image_hash.delete_image(post.post_id) image_hash.add_image(post.post_id, content) diff --git 
a/server/szurubooru/tests/func/test_image_hash.py b/server/szurubooru/tests/func/test_image_hash.py index 3bbdeba2..becba906 100644 --- a/server/szurubooru/tests/func/test_image_hash.py +++ b/server/szurubooru/tests/func/test_image_hash.py @@ -1,4 +1,3 @@ -from time import sleep from szurubooru.func import image_hash @@ -7,11 +6,10 @@ def test_hashing(read_asset, config_injector): image_hash.purge() image_hash.add_image('test', read_asset('jpeg.jpg')) - sleep(0.1) - paths = image_hash.get_all_paths() results_exact = image_hash.search_by_image(read_asset('jpeg.jpg')) - results_similar = image_hash.search_by_image(read_asset('jpeg-similar.jpg')) + results_similar = image_hash.search_by_image( + read_asset('jpeg-similar.jpg')) assert len(paths) == 1 assert len(results_exact) == 1