server/image-hash: do not depend on image-match
While I hold this library in great esteem for its excellent work on implementing the original paper, I have several problems with it:

- as of this commit, it (again) has bug fixes unreleased on pip
- its code is badly structured:
  - it forces OOP and then proceeds to @staticmethod everything
  - bad class design; parameters are repeated in several places
  - terrible contract of make_record() and generate_signature()
  - ambiguous parameters: path vs. image path vs. image content
- it doesn't adhere to PEP-8
- it depends on cairo just to render SVG images, which almost no one uses this library with
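The replacement is a plain module rather than a driver class; its public surface, as introduced in the diff below, is add_image(), delete_image(), search_by_image(), purge() and get_all_paths(). A minimal usage sketch of that surface (the file name, path string and content variables are placeholders, not names from the codebase):

    from szurubooru.func import image_hash

    # raw image file bytes; the new code feeds them straight into PIL via BytesIO
    with open('example.jpg', 'rb') as handle:
        image_content = handle.read()

    # index the image under a post path (both arguments are required;
    # the new code asserts on them instead of silently returning)
    image_hash.add_image('some-post-path', image_content)

    # search_by_image() returns Lookalike objects built with score,
    # distance and path, already filtered by DISTANCE_CUTOFF
    for match in image_hash.search_by_image(image_content):
        print(match.path, match.distance, match.score)

    image_hash.delete_image('some-post-path')
    image_hash.purge()  # drop every signature from the index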
parent 894cd29511
commit fd30675124
4 changed files with 267 additions and 45 deletions
@@ -7,7 +7,6 @@ pytest-cov>=2.2.1
 freezegun>=0.3.6
 coloredlogs==5.0
 pycodestyle>=2.0.0
-image-match>=1.1.0
 scipy>=0.18.1
 elasticsearch>=5.0.0
 elasticsearch-dsl>=5.0.0
@@ -1,11 +1,13 @@
 import logging
+from io import BytesIO
+from datetime import datetime
 import elasticsearch
 import elasticsearch_dsl
-import xml.etree
-from image_match.elasticsearch_driver import SignatureES
+import numpy as np
+from skimage.color import rgb2gray
+from PIL import Image
 from szurubooru import config, errors


 # pylint: disable=invalid-name
 logger = logging.getLogger(__name__)
 es = elasticsearch.Elasticsearch([{
@@ -14,11 +16,190 @@ es = elasticsearch.Elasticsearch([{
 }])


-def _get_session():
-    return SignatureES(es, index=config.config['elasticsearch']['index'])
+# Math based on paper from H. Chi Wong, Marshall Bern and David Goldberg
+# Math code taken from https://github.com/ascribe/image-match
+# (which is licensed under Apache 2 license)
+LOWER_PERCENTILE = 5
+UPPER_PERCENTILE = 95
+IDENTICAL_TOLERANCE = 2 / 255.
+DISTANCE_CUTOFF = 0.45
+N_LEVELS = 2
+N = 9
+P = None
+SAMPLE_WORDS = 16
+MAX_WORDS = 63
+ES_DOC_TYPE = 'image'
+ES_MAX_RESULTS = 100
+
+
+def _preprocess_image(image_or_path):
+    img = Image.open(BytesIO(image_or_path))
+    img = img.convert('RGB')
+    return rgb2gray(np.asarray(img, dtype=np.uint8))
+
+
+def _crop_image(image, lower_percentile, upper_percentile):
+    rw = np.cumsum(np.sum(np.abs(np.diff(image, axis=1)), axis=1))
+    cw = np.cumsum(np.sum(np.abs(np.diff(image, axis=0)), axis=0))
+    upper_column_limit = np.searchsorted(
+        cw, np.percentile(cw, upper_percentile), side='left')
+    lower_column_limit = np.searchsorted(
+        cw, np.percentile(cw, lower_percentile), side='right')
+    upper_row_limit = np.searchsorted(
+        rw, np.percentile(rw, upper_percentile), side='left')
+    lower_row_limit = np.searchsorted(
+        rw, np.percentile(rw, lower_percentile), side='right')
+    if lower_row_limit > upper_row_limit:
+        lower_row_limit = int(lower_percentile / 100. * image.shape[0])
+        upper_row_limit = int(upper_percentile / 100. * image.shape[0])
+    if lower_column_limit > upper_column_limit:
+        lower_column_limit = int(lower_percentile / 100. * image.shape[1])
+        upper_column_limit = int(upper_percentile / 100. * image.shape[1])
+    return [
+        (lower_row_limit, upper_row_limit),
+        (lower_column_limit, upper_column_limit)]
+
+
+def _normalize_and_threshold(diff_array, identical_tolerance, n_levels):
+    mask = np.abs(diff_array) < identical_tolerance
+    diff_array[mask] = 0.
+    if np.all(mask):
+        return None
+    positive_cutoffs = np.percentile(
+        diff_array[diff_array > 0.], np.linspace(0, 100, n_levels+1))
+    negative_cutoffs = np.percentile(
+        diff_array[diff_array < 0.], np.linspace(100, 0, n_levels+1))
+    for level, interval in enumerate(
+            positive_cutoffs[i:i+2]
+            for i in range(positive_cutoffs.shape[0] - 1)):
+        diff_array[
+            (diff_array >= interval[0]) & (diff_array <= interval[1])] = \
+            level + 1
+    for level, interval in enumerate(
+            negative_cutoffs[i:i+2]
+            for i in range(negative_cutoffs.shape[0] - 1)):
+        diff_array[
+            (diff_array <= interval[0]) & (diff_array >= interval[1])] = \
+            -(level + 1)
+    return None
+
+
+def _compute_grid_points(image, n, window=None):
+    if window is None:
+        window = [(0, image.shape[0]), (0, image.shape[1])]
+    x_coords = np.linspace(window[0][0], window[0][1], n + 2, dtype=int)[1:-1]
+    y_coords = np.linspace(window[1][0], window[1][1], n + 2, dtype=int)[1:-1]
+    return x_coords, y_coords
+
+
+def _compute_mean_level(image, x_coords, y_coords, p):
+    if p is None:
+        p = max([2.0, int(0.5 + min(image.shape) / 20.)])
+    avg_grey = np.zeros((x_coords.shape[0], y_coords.shape[0]))
+    for i, x in enumerate(x_coords):
+        lower_x_lim = int(max([x - p / 2, 0]))
+        upper_x_lim = int(min([lower_x_lim + p, image.shape[0]]))
+        for j, y in enumerate(y_coords):
+            lower_y_lim = int(max([y - p / 2, 0]))
+            upper_y_lim = int(min([lower_y_lim + p, image.shape[1]]))
+            avg_grey[i, j] = np.mean(
+                image[lower_x_lim:upper_x_lim, lower_y_lim:upper_y_lim])
+    return avg_grey
+
+
+def _compute_differentials(grey_level_matrix):
+    flipped = np.fliplr(grey_level_matrix)
+    right_neighbors = -np.concatenate((
+        np.diff(grey_level_matrix),
+        np.zeros(grey_level_matrix.shape[0])
+        .reshape((grey_level_matrix.shape[0], 1))), axis=1)
+    down_neighbors = -np.concatenate((
+        np.diff(grey_level_matrix, axis=0),
+        np.zeros(grey_level_matrix.shape[1])
+        .reshape((1, grey_level_matrix.shape[1]))))
+    left_neighbors = -np.concatenate(
+        (right_neighbors[:, -1:], right_neighbors[:, :-1]), axis=1)
+    up_neighbors = -np.concatenate((down_neighbors[-1:], down_neighbors[:-1]))
+    diagonals = np.arange(
+        -grey_level_matrix.shape[0] + 1, grey_level_matrix.shape[0])
+    upper_left_neighbors = sum([
+        np.diagflat(np.insert(np.diff(np.diag(grey_level_matrix, i)), 0, 0), i)
+        for i in diagonals])
+    upper_right_neighbors = sum([
+        np.diagflat(np.insert(np.diff(np.diag(flipped, i)), 0, 0), i)
+        for i in diagonals])
+    lower_right_neighbors = -np.pad(
+        upper_left_neighbors[1:, 1:], (0, 1), mode='constant')
+    lower_left_neighbors = -np.pad(
+        upper_right_neighbors[1:, 1:], (0, 1), mode='constant')
+    return np.dstack(np.array([
+        upper_left_neighbors,
+        up_neighbors,
+        np.fliplr(upper_right_neighbors),
+        left_neighbors,
+        right_neighbors,
+        np.fliplr(lower_left_neighbors),
+        down_neighbors,
+        lower_right_neighbors]))
+
+
+def _generate_signature(path_or_image):
+    im_array = _preprocess_image(path_or_image)
+    image_limits = _crop_image(im_array,
+        lower_percentile=LOWER_PERCENTILE,
+        upper_percentile=UPPER_PERCENTILE)
+    x_coords, y_coords = _compute_grid_points(
+        im_array, n=N, window=image_limits)
+    avg_grey = _compute_mean_level(im_array, x_coords, y_coords, p=P)
+    diff_matrix = _compute_differentials(avg_grey)
+    _normalize_and_threshold(diff_matrix,
+        identical_tolerance=IDENTICAL_TOLERANCE, n_levels=N_LEVELS)
+    return np.ravel(diff_matrix).astype('int8')
+
+
+def _get_words(array, k, n):
+    word_positions = np.linspace(
+        0, array.shape[0], n, endpoint=False).astype('int')
+    assert k <= array.shape[0]
+    assert word_positions.shape[0] <= array.shape[0]
+    words = np.zeros((n, k)).astype('int8')
+    for i, pos in enumerate(word_positions):
+        if pos + k <= array.shape[0]:
+            words[i] = array[pos:pos+k]
+        else:
+            temp = array[pos:].copy()
+            temp.resize(k)
+            words[i] = temp
+    _max_contrast(words)
+    words = _words_to_int(words)
+    return words
+
+
+def _words_to_int(word_array):
+    width = word_array.shape[1]
+    coding_vector = 3**np.arange(width)
+    return np.dot(word_array + 1, coding_vector)
+
+
+def _max_contrast(array):
+    array[array > 0] = 1
+    array[array < 0] = -1
+    return None
+
+
+def _normalized_distance(_target_array, _vec, nan_value=1.0):
+    target_array = _target_array.astype(int)
+    vec = _vec.astype(int)
+    topvec = np.linalg.norm(vec - target_array, axis=1)
+    norm1 = np.linalg.norm(vec, axis=0)
+    norm2 = np.linalg.norm(target_array, axis=1)
+    finvec = topvec / (norm1 + norm2)
+    finvec[np.isnan(finvec)] = nan_value
+    return finvec


-def _safe_blanket(default_param_factory):
+def _safety_blanket(default_param_factory):
     def wrapper_outer(target_function):
         def wrapper_inner(*args, **kwargs):
             try:
@@ -28,14 +209,13 @@ def _safe_blanket(default_param_factory):
                 # add_image()
                 return default_param_factory()
             except elasticsearch.exceptions.ElasticsearchException as ex:
-                logger.warning('Problem with elastic search: %s' % ex)
+                logger.warning('Problem with elastic search: %s', ex)
                 raise errors.ThirdPartyError(
                     'Error connecting to elastic search.')
-            except xml.etree.ElementTree.ParseError as ex:
-                # image-match issue #60
+            except IOError:
                 raise errors.ProcessingError('Not an image.')
             except Exception as ex:
-                raise errors.ThirdPartyError('Unknown error (%s).' % ex)
+                raise errors.ThirdPartyError('Unknown error (%s).', ex)
         return wrapper_inner
     return wrapper_outer
@@ -47,53 +227,96 @@ class Lookalike:
         self.path = path


-@_safe_blanket(lambda: None)
+@_safety_blanket(lambda: None)
 def add_image(path, image_content):
-    if not path or not image_content:
-        return
-    session = _get_session()
-    session.add_image(path=path, img=image_content, bytestream=True)
+    assert path
+    assert image_content
+    signature = _generate_signature(image_content)
+    words = _get_words(signature, k=SAMPLE_WORDS, n=MAX_WORDS)
+
+    record = {
+        'signature': signature.tolist(),
+        'path': path,
+        'timestamp': datetime.now(),
+    }
+    for i in range(MAX_WORDS):
+        record['simple_word_' + str(i)] = words[i].tolist()
+
+    es.index(
+        index=config.config['elasticsearch']['index'],
+        doc_type=ES_DOC_TYPE,
+        body=record,
+        refresh=True)


-@_safe_blanket(lambda: None)
+@_safety_blanket(lambda: None)
 def delete_image(path):
-    if not path:
-        return
-    session = _get_session()
+    assert path
     es.delete_by_query(
-        index=session.index,
-        doc_type=session.doc_type,
+        index=config.config['elasticsearch']['index'],
+        doc_type=ES_DOC_TYPE,
         body={'query': {'term': {'path': path}}})


-@_safe_blanket(lambda: [])
+@_safety_blanket(lambda: [])
 def search_by_image(image_content):
+    signature = _generate_signature(image_content)
+    words = _get_words(signature, k=SAMPLE_WORDS, n=MAX_WORDS)
+
+    res = es.search(
+        index=config.config['elasticsearch']['index'],
+        doc_type=ES_DOC_TYPE,
+        body={
+            'query':
+            {
+                'bool':
+                {
+                    'should':
+                    [
+                        {'term': {'simple_word_%d' % i: word.tolist()}}
+                        for i, word in enumerate(words)
+                    ]
+                }
+            },
+            '_source': {'excludes': ['simple_word_*']}},
+        size=ES_MAX_RESULTS,
+        timeout='10s')['hits']['hits']
+
+    if len(res) == 0:
+        return []
+
+    sigs = np.array([x['_source']['signature'] for x in res])
+    dists = _normalized_distance(sigs, np.array(signature))
+
+    ids = set()
     ret = []
-    session = _get_session()
-    for result in session.search_image(
-            path=image_content,  # sic
-            bytestream=True):
-        ret.append(Lookalike(
-            score=result['score'],
-            distance=result['dist'],
-            path=result['path']))
+    for item, dist in zip(res, dists):
+        id = item['_id']
+        score = item['_score']
+        path = item['_source']['path']
+        if id in ids:
+            continue
+        ids.add(id)
+        if dist < DISTANCE_CUTOFF:
+            ret.append(Lookalike(score=score, distance=dist, path=path))
     return ret


-@_safe_blanket(lambda: None)
+@_safety_blanket(lambda: None)
 def purge():
-    session = _get_session()
     es.delete_by_query(
-        index=session.index,
-        doc_type=session.doc_type,
-        body={'query': {'match_all': {}}})
+        index=config.config['elasticsearch']['index'],
+        doc_type=ES_DOC_TYPE,
+        body={'query': {'match_all': {}}},
+        refresh=True)


-@_safe_blanket(lambda: set())
+@_safety_blanket(lambda: set())
 def get_all_paths():
-    session = _get_session()
     search = (
         elasticsearch_dsl.Search(
-            using=es, index=session.index, doc_type=session.doc_type)
+            using=es,
+            index=config.config['elasticsearch']['index'],
+            doc_type=ES_DOC_TYPE)
         .source(['path']))
     return set(h.path for h in search.scan())
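To make the lookup indexable, the new code cuts the flattened signature into fixed-length "words", clamps every entry to -1/0/1 (_max_contrast) and packs each word into a single base-3 integer (_words_to_int); those integers are what add_image() stores as simple_word_N fields and what search_by_image() matches with the bool/should terms query above. A small illustration of that packing outside the module (the sample values are made up):

    import numpy as np

    # one "word": 16 signature entries already clamped to {-1, 0, 1}
    word = np.array([[1, 0, -1, 1, 0, 0, 1, -1, 0, 1, 0, 0, -1, 1, 0, 0]])

    # shift to {0, 1, 2} and read the word as a base-3 number,
    # mirroring _words_to_int()
    coding_vector = 3 ** np.arange(word.shape[1])
    packed = np.dot(word + 1, coding_vector)
    print(packed)  # a single integer usable as a 'simple_word_*' term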
@@ -268,6 +268,7 @@ def _after_post_update(_mapper, _connection, post):

 @sqlalchemy.events.event.listens_for(db.Post, 'before_delete')
 def _before_post_delete(_mapper, _connection, post):
-    image_hash.delete_image(post.post_id)
+    if post.post_id:
+        image_hash.delete_image(post.post_id)


@@ -279,7 +280,8 @@ def _sync_post_content(post):
         files.save(get_post_content_path(post), content)
         delattr(post, '__content')
         regenerate_thumb = True
-        if post.type in (db.Post.TYPE_IMAGE, db.Post.TYPE_ANIMATION):
+        if post.post_id and post.type in (
+                db.Post.TYPE_IMAGE, db.Post.TYPE_ANIMATION):
             image_hash.delete_image(post.post_id)
             image_hash.add_image(post.post_id, content)
@@ -1,4 +1,3 @@
-from time import sleep
 from szurubooru.func import image_hash


@@ -7,11 +6,10 @@ def test_hashing(read_asset, config_injector):
     image_hash.purge()
     image_hash.add_image('test', read_asset('jpeg.jpg'))

-    sleep(0.1)
-
     paths = image_hash.get_all_paths()
     results_exact = image_hash.search_by_image(read_asset('jpeg.jpg'))
-    results_similar = image_hash.search_by_image(read_asset('jpeg-similar.jpg'))
+    results_similar = image_hash.search_by_image(
+        read_asset('jpeg-similar.jpg'))

     assert len(paths) == 1
     assert len(results_exact) == 1