server/image-hash: do not depend on image-match

While I hold this library in great esteem for its excellent work on
implementing the original paper, I have several problems with it:

- as of this commit, it (again) has bug fixes unreleased on pip
- its code is badly structured
    - forces OOP and then proceeds to @staticmethod everything
    - bad class design, parameters are repeated in several places
    - terrible contract of make_record() and generate_signature()
    - ambiguous parameters: path vs. image path vs. image content
    - doesn't adhere to PEP-8
- depends on cairo just to render SVG images, which almost no one uses
  this library with
This commit is contained in:
rr- 2017-02-03 21:20:52 +01:00
parent 894cd29511
commit fd30675124
4 changed files with 267 additions and 45 deletions

View file

@ -7,7 +7,6 @@ pytest-cov>=2.2.1
freezegun>=0.3.6 freezegun>=0.3.6
coloredlogs==5.0 coloredlogs==5.0
pycodestyle>=2.0.0 pycodestyle>=2.0.0
image-match>=1.1.0
scipy>=0.18.1 scipy>=0.18.1
elasticsearch>=5.0.0 elasticsearch>=5.0.0
elasticsearch-dsl>=5.0.0 elasticsearch-dsl>=5.0.0

View file

@ -1,11 +1,13 @@
import logging import logging
from io import BytesIO
from datetime import datetime
import elasticsearch import elasticsearch
import elasticsearch_dsl import elasticsearch_dsl
import xml.etree import numpy as np
from image_match.elasticsearch_driver import SignatureES from skimage.color import rgb2gray
from PIL import Image
from szurubooru import config, errors from szurubooru import config, errors
# pylint: disable=invalid-name # pylint: disable=invalid-name
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
es = elasticsearch.Elasticsearch([{ es = elasticsearch.Elasticsearch([{
@ -14,11 +16,190 @@ es = elasticsearch.Elasticsearch([{
}]) }])
def _get_session(): # Math based on paper from H. Chi Wong, Marshall Bern and David Goldberg
return SignatureES(es, index=config.config['elasticsearch']['index']) # Math code taken from https://github.com/ascribe/image-match
# (which is licensed under Apache 2 license)
# Percentiles handed to _crop_image() by _generate_signature() to pick the
# window of the image that is sampled.
LOWER_PERCENTILE = 5
UPPER_PERCENTILE = 95
# Differentials whose absolute value is below this are zeroed by
# _normalize_and_threshold().
IDENTICAL_TOLERANCE = 2 / 255.
# search_by_image() only returns candidates whose normalized distance is
# strictly below this value.
DISTANCE_CUTOFF = 0.45
# Number of positive (and, symmetrically, negative) levels that
# _normalize_and_threshold() quantizes differentials into.
N_LEVELS = 2
# Passed as `n` to _compute_grid_points(): grid points sampled per axis.
N = 9
# Passed as `p` to _compute_mean_level(); None makes that function derive the
# averaging window size from the image dimensions.
P = None
# Passed as `k` to _get_words(): number of signature entries per word.
SAMPLE_WORDS = 16
# Passed as `n` to _get_words(): number of words taken from a signature.
MAX_WORDS = 63
# Document type used for every Elasticsearch call in this module.
ES_DOC_TYPE = 'image'
# `size` of the query issued by search_by_image().
ES_MAX_RESULTS = 100
def _preprocess_image(image_or_path):
    """Decode encoded image content into a greyscale array.

    Despite the parameter name, the value is wrapped in ``BytesIO``, so it
    has to be the bytes of an encoded image rather than a filesystem path.
    The image is converted to RGB, turned into a ``uint8`` array and passed
    through ``rgb2gray``, whose result is returned.
    """
    rgb_image = Image.open(BytesIO(image_or_path)).convert('RGB')
    pixels = np.asarray(rgb_image, dtype=np.uint8)
    return rgb2gray(pixels)
def _crop_image(image, lower_percentile, upper_percentile):
rw = np.cumsum(np.sum(np.abs(np.diff(image, axis=1)), axis=1))
cw = np.cumsum(np.sum(np.abs(np.diff(image, axis=0)), axis=0))
upper_column_limit = np.searchsorted(
cw, np.percentile(cw, upper_percentile), side='left')
lower_column_limit = np.searchsorted(
cw, np.percentile(cw, lower_percentile), side='right')
upper_row_limit = np.searchsorted(
rw, np.percentile(rw, upper_percentile), side='left')
lower_row_limit = np.searchsorted(
rw, np.percentile(rw, lower_percentile), side='right')
if lower_row_limit > upper_row_limit:
lower_row_limit = int(lower_percentile / 100. * image.shape[0])
upper_row_limit = int(upper_percentile / 100. * image.shape[0])
if lower_column_limit > upper_column_limit:
lower_column_limit = int(lower_percentile / 100. * image.shape[1])
upper_column_limit = int(upper_percentile / 100. * image.shape[1])
return [
(lower_row_limit, upper_row_limit),
(lower_column_limit, upper_column_limit)]
def _normalize_and_threshold(diff_array, identical_tolerance, n_levels):
mask = np.abs(diff_array) < identical_tolerance
diff_array[mask] = 0.
if np.all(mask):
return None
positive_cutoffs = np.percentile(
diff_array[diff_array > 0.], np.linspace(0, 100, n_levels+1))
negative_cutoffs = np.percentile(
diff_array[diff_array < 0.], np.linspace(100, 0, n_levels+1))
for level, interval in enumerate(
positive_cutoffs[i:i+2]
for i in range(positive_cutoffs.shape[0] - 1)):
diff_array[
(diff_array >= interval[0]) & (diff_array <= interval[1])] = \
level + 1
for level, interval in enumerate(
negative_cutoffs[i:i+2]
for i in range(negative_cutoffs.shape[0] - 1)):
diff_array[
(diff_array <= interval[0]) & (diff_array >= interval[1])] = \
-(level + 1)
return None
def _compute_grid_points(image, n, window=None):
if window is None:
window = [(0, image.shape[0]), (0, image.shape[1])]
x_coords = np.linspace(window[0][0], window[0][1], n + 2, dtype=int)[1:-1]
y_coords = np.linspace(window[1][0], window[1][1], n + 2, dtype=int)[1:-1]
return x_coords, y_coords
def _compute_mean_level(image, x_coords, y_coords, p):
if p is None:
p = max([2.0, int(0.5 + min(image.shape) / 20.)])
avg_grey = np.zeros((x_coords.shape[0], y_coords.shape[0]))
for i, x in enumerate(x_coords):
lower_x_lim = int(max([x - p / 2, 0]))
upper_x_lim = int(min([lower_x_lim + p, image.shape[0]]))
for j, y in enumerate(y_coords):
lower_y_lim = int(max([y - p / 2, 0]))
upper_y_lim = int(min([lower_y_lim + p, image.shape[1]]))
avg_grey[i, j] = np.mean(
image[lower_x_lim:upper_x_lim, lower_y_lim:upper_y_lim])
return avg_grey
def _compute_differentials(grey_level_matrix):
    """Compute, for every cell, its difference to each of its 8 neighbours.

    Every layer of the result holds "cell value minus neighbour value" for
    one direction; cells that have no neighbour in that direction get 0.
    The 8 layers are stacked along the last axis in this order: upper-left,
    up, upper-right, left, right, lower-left, down, lower-right.

    NOTE(review): the diagonal offsets are derived from ``shape[0]`` only,
    so this presumably expects a square matrix -- confirm against callers.
    """
    # Mirrored copy, used to get anti-diagonals via the same diag() helpers.
    flipped = np.fliplr(grey_level_matrix)
    # cell - right neighbour; a zero column is appended for the last column.
    right_neighbors = -np.concatenate((
        np.diff(grey_level_matrix),
        np.zeros(grey_level_matrix.shape[0])
        .reshape((grey_level_matrix.shape[0], 1))), axis=1)
    # cell - lower neighbour; a zero row is appended for the last row.
    down_neighbors = -np.concatenate((
        np.diff(grey_level_matrix, axis=0),
        np.zeros(grey_level_matrix.shape[1])
        .reshape((1, grey_level_matrix.shape[1]))))
    # Left/up are the right/down differences shifted by one cell with the
    # sign flipped; the zero padding wraps around to the first column/row.
    left_neighbors = -np.concatenate(
        (right_neighbors[:, -1:], right_neighbors[:, :-1]), axis=1)
    up_neighbors = -np.concatenate((down_neighbors[-1:], down_neighbors[:-1]))
    # Offsets of every diagonal of the matrix, from bottom-left to top-right.
    diagonals = np.arange(
        -grey_level_matrix.shape[0] + 1, grey_level_matrix.shape[0])
    # Difference along each diagonal, with a leading 0 for the first cell of
    # the diagonal, scattered back onto the same diagonal and summed.
    upper_left_neighbors = sum([
        np.diagflat(np.insert(np.diff(np.diag(grey_level_matrix, i)), 0, 0), i)
        for i in diagonals])
    # Same computation on the mirrored matrix; flipped back before stacking.
    upper_right_neighbors = sum([
        np.diagflat(np.insert(np.diff(np.diag(flipped, i)), 0, 0), i)
        for i in diagonals])
    # Lower diagonals: shift the upper ones by one cell along the diagonal,
    # negate, and zero-pad the last row and column.
    lower_right_neighbors = -np.pad(
        upper_left_neighbors[1:, 1:], (0, 1), mode='constant')
    lower_left_neighbors = -np.pad(
        upper_right_neighbors[1:, 1:], (0, 1), mode='constant')
    return np.dstack(np.array([
        upper_left_neighbors,
        up_neighbors,
        np.fliplr(upper_right_neighbors),
        left_neighbors,
        right_neighbors,
        np.fliplr(lower_left_neighbors),
        down_neighbors,
        lower_right_neighbors]))
def _generate_signature(path_or_image):
    """Build the flat ``int8`` signature vector for an image.

    The input is forwarded unchanged to ``_preprocess_image``. The
    greyscale result is cropped, sampled on an ``N`` x ``N`` grid and
    averaged around every grid point. The neighbour differentials of those
    averages are quantized in place and flattened into the signature.
    """
    grey = _preprocess_image(path_or_image)
    crop_window = _crop_image(
        grey,
        lower_percentile=LOWER_PERCENTILE,
        upper_percentile=UPPER_PERCENTILE)
    grid_x, grid_y = _compute_grid_points(grey, n=N, window=crop_window)
    mean_levels = _compute_mean_level(grey, grid_x, grid_y, p=P)
    differentials = _compute_differentials(mean_levels)
    # Quantization mutates `differentials`; the return value is unused.
    _normalize_and_threshold(
        differentials,
        identical_tolerance=IDENTICAL_TOLERANCE,
        n_levels=N_LEVELS)
    return differentials.ravel().astype('int8')
def _get_words(array, k, n):
    """Cut ``n`` words of length ``k`` out of a signature and encode them.

    Word start positions are spread evenly over ``array``. A word that
    would run past the end of the array is padded with zeros. The words
    are then reduced to their signs and each one is packed into a single
    integer.
    """
    length = array.shape[0]
    starts = np.linspace(0, length, n, endpoint=False).astype('int')
    assert k <= length
    assert starts.shape[0] <= length

    words = np.zeros((n, k)).astype('int8')
    for row, start in zip(words, starts):
        # Slicing truncates at the end of the array, so a short chunk
        # leaves the tail of the (already zeroed) row untouched.
        chunk = array[start:start + k]
        row[:chunk.shape[0]] = chunk

    _max_contrast(words)
    return _words_to_int(words)
def _words_to_int(word_array):
width = word_array.shape[1]
coding_vector = 3**np.arange(width)
return np.dot(word_array + 1, coding_vector)
def _max_contrast(array):
array[array > 0] = 1
array[array < 0] = -1
return None
def _normalized_distance(_target_array, _vec, nan_value=1.0):
target_array = _target_array.astype(int)
vec = _vec.astype(int)
topvec = np.linalg.norm(vec - target_array, axis=1)
norm1 = np.linalg.norm(vec, axis=0)
norm2 = np.linalg.norm(target_array, axis=1)
finvec = topvec / (norm1 + norm2)
finvec[np.isnan(finvec)] = nan_value
return finvec
def _safety_blanket(default_param_factory):
def wrapper_outer(target_function): def wrapper_outer(target_function):
def wrapper_inner(*args, **kwargs): def wrapper_inner(*args, **kwargs):
try: try:
@ -28,14 +209,13 @@ def _safe_blanket(default_param_factory):
# add_image() # add_image()
return default_param_factory() return default_param_factory()
except elasticsearch.exceptions.ElasticsearchException as ex: except elasticsearch.exceptions.ElasticsearchException as ex:
logger.warning('Problem with elastic search: %s' % ex) logger.warning('Problem with elastic search: %s', ex)
raise errors.ThirdPartyError( raise errors.ThirdPartyError(
'Error connecting to elastic search.') 'Error connecting to elastic search.')
except xml.etree.ElementTree.ParseError as ex: except IOError:
# image-match issue #60
raise errors.ProcessingError('Not an image.') raise errors.ProcessingError('Not an image.')
except Exception as ex: except Exception as ex:
raise errors.ThirdPartyError('Unknown error (%s).' % ex) raise errors.ThirdPartyError('Unknown error (%s).', ex)
return wrapper_inner return wrapper_inner
return wrapper_outer return wrapper_outer
@ -47,53 +227,96 @@ class Lookalike:
self.path = path self.path = path
@_safety_blanket(lambda: None)
def add_image(path, image_content):
    """Index the signature of ``image_content`` under ``path``.

    Both arguments must be truthy. The stored document holds the full
    signature, the path, the current timestamp and one ``simple_word_<i>``
    field per signature word. The index is refreshed as part of the call.
    """
    assert path
    assert image_content

    signature = _generate_signature(image_content)
    words = _get_words(signature, k=SAMPLE_WORDS, n=MAX_WORDS)

    record = {
        'signature': signature.tolist(),
        'path': path,
        'timestamp': datetime.now(),
    }
    record.update(
        ('simple_word_' + str(i), words[i].tolist())
        for i in range(MAX_WORDS))

    index_name = config.config['elasticsearch']['index']
    es.index(
        index=index_name,
        doc_type=ES_DOC_TYPE,
        body=record,
        refresh=True)
@_safety_blanket(lambda: None)
def delete_image(path):
    """Delete every indexed document whose ``path`` term equals ``path``.

    ``path`` must be truthy.
    """
    assert path
    index_name = config.config['elasticsearch']['index']
    by_path = {'query': {'term': {'path': path}}}
    es.delete_by_query(
        index=index_name,
        doc_type=ES_DOC_TYPE,
        body=by_path)
@_safety_blanket(lambda: [])
def search_by_image(image_content):
    """Find indexed images that look like ``image_content``.

    Candidates are fetched from Elasticsearch by matching any of the
    signature words (at most ``ES_MAX_RESULTS`` hits). Each candidate's
    stored signature is then compared with the query signature, and only
    those closer than ``DISTANCE_CUTOFF`` are kept.

    Returns a list of ``Lookalike`` objects, at most one per document id,
    in the order Elasticsearch returned the hits; empty if nothing matched.
    """
    signature = _generate_signature(image_content)
    words = _get_words(signature, k=SAMPLE_WORDS, n=MAX_WORDS)

    res = es.search(
        index=config.config['elasticsearch']['index'],
        doc_type=ES_DOC_TYPE,
        body={
            'query':
            {
                'bool':
                {
                    'should':
                    [
                        {'term': {'simple_word_%d' % i: word.tolist()}}
                        for i, word in enumerate(words)
                    ]
                }
            },
            # The word fields are only needed for matching, not for scoring.
            '_source': {'excludes': ['simple_word_*']}},
        size=ES_MAX_RESULTS,
        timeout='10s')['hits']['hits']

    if not res:
        return []

    sigs = np.array([x['_source']['signature'] for x in res])
    dists = _normalized_distance(sigs, np.array(signature))

    seen_ids = set()
    ret = []
    for item, dist in zip(res, dists):
        # `doc_id` rather than `id`, so the builtin is not shadowed.
        doc_id = item['_id']
        if doc_id in seen_ids:
            continue
        seen_ids.add(doc_id)
        if dist < DISTANCE_CUTOFF:
            ret.append(Lookalike(
                score=item['_score'],
                distance=dist,
                path=item['_source']['path']))
    return ret
@_safety_blanket(lambda: None)
def purge():
    """Delete every document from the configured index and refresh it."""
    index_name = config.config['elasticsearch']['index']
    everything = {'query': {'match_all': {}}}
    es.delete_by_query(
        index=index_name,
        doc_type=ES_DOC_TYPE,
        body=everything,
        refresh=True)
@_safety_blanket(set)
def get_all_paths():
    """Return the set of ``path`` values of all indexed documents.

    Only the ``path`` field is requested; the results are read by scanning
    the whole index.
    """
    index_name = config.config['elasticsearch']['index']
    search = elasticsearch_dsl.Search(
        using=es,
        index=index_name,
        doc_type=ES_DOC_TYPE)
    search = search.source(['path'])
    return {hit.path for hit in search.scan()}

View file

@ -268,6 +268,7 @@ def _after_post_update(_mapper, _connection, post):
@sqlalchemy.events.event.listens_for(db.Post, 'before_delete')
def _before_post_delete(_mapper, _connection, post):
    """Remove the post's entry from the image hash index before deletion.

    Posts without a ``post_id`` are skipped, since ``delete_image`` asserts
    that it is given a truthy path.
    """
    if not post.post_id:
        return
    image_hash.delete_image(post.post_id)
@ -279,7 +280,8 @@ def _sync_post_content(post):
files.save(get_post_content_path(post), content) files.save(get_post_content_path(post), content)
delattr(post, '__content') delattr(post, '__content')
regenerate_thumb = True regenerate_thumb = True
if post.type in (db.Post.TYPE_IMAGE, db.Post.TYPE_ANIMATION): if post.post_id and post.type in (
db.Post.TYPE_IMAGE, db.Post.TYPE_ANIMATION):
image_hash.delete_image(post.post_id) image_hash.delete_image(post.post_id)
image_hash.add_image(post.post_id, content) image_hash.add_image(post.post_id, content)

View file

@ -1,4 +1,3 @@
from time import sleep
from szurubooru.func import image_hash from szurubooru.func import image_hash
@ -7,11 +6,10 @@ def test_hashing(read_asset, config_injector):
image_hash.purge() image_hash.purge()
image_hash.add_image('test', read_asset('jpeg.jpg')) image_hash.add_image('test', read_asset('jpeg.jpg'))
sleep(0.1)
paths = image_hash.get_all_paths() paths = image_hash.get_all_paths()
results_exact = image_hash.search_by_image(read_asset('jpeg.jpg')) results_exact = image_hash.search_by_image(read_asset('jpeg.jpg'))
results_similar = image_hash.search_by_image(read_asset('jpeg-similar.jpg')) results_similar = image_hash.search_by_image(
read_asset('jpeg-similar.jpg'))
assert len(paths) == 1 assert len(paths) == 1
assert len(results_exact) == 1 assert len(results_exact) == 1