server/posts: store and provide MD5 checksums

This commit is contained in:
Shyam Sunder 2021-01-05 13:00:13 -05:00
parent 7515b8e605
commit 2bdb072296
7 changed files with 71 additions and 10 deletions

View file

@ -2366,6 +2366,7 @@ One file together with its metadata posted to the site.
"source": <source>,
"type": <type>,
"checksum": <checksum>,
"checksumMD5": <checksum-MD5>,
"canvasWidth": <canvas-width>,
"canvasHeight": <canvas-height>,
"contentUrl": <content-url>,
@ -2426,8 +2427,9 @@ One file together with its metadata posted to the site.
- `"flash"` - Flash animation / game.
- `"youtube"` - YouTube embed.
- `<checksum>`: the file checksum. Used in snapshots to signify changes of the
- `<checksum>`: the SHA1 file checksum. Used in snapshots to signify changes of the
post content.
- `<checksum-MD5>`: the MD5 file checksum.
- `<canvas-width>` and `<canvas-height>`: the original width and height of the
post content.
- `<content-url>`: where the post content is located.

View file

@ -10,7 +10,10 @@ import sqlalchemy.orm.exc
from szurubooru import api, config, db, errors, middleware, rest
from szurubooru.func.file_uploads import purge_old_uploads
from szurubooru.func.posts import update_all_post_signatures
from szurubooru.func.posts import (
update_all_md5_checksums,
update_all_post_signatures,
)
def _map_error(
@ -125,6 +128,12 @@ def purge_old_uploads_daemon() -> None:
time.sleep(60 * 5)
# Callables that create_app() launches at startup, each one on its own
# non-daemon thread.
_live_migrations = (
    update_all_post_signatures,
    update_all_md5_checksums,
)
def create_app() -> Callable[[Any, Any], Any]:
""" Create a WSGI compatible App object. """
validate_config()
@ -134,13 +143,10 @@ def create_app() -> Callable[[Any, Any], Any]:
if config.config["show_sql"]:
logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)
purge_thread = threading.Thread(target=purge_old_uploads_daemon)
purge_thread.daemon = True
purge_thread.start()
threading.Thread(target=purge_old_uploads_daemon, daemon=True).start()
hashing_thread = threading.Thread(target=update_all_post_signatures)
hashing_thread.daemon = False
hashing_thread.start()
for migration in _live_migrations:
threading.Thread(target=migration, daemon=False).start()
db.session.commit()

View file

@ -174,6 +174,7 @@ class PostSerializer(serialization.BaseSerializer):
"type": self.serialize_type,
"mimeType": self.serialize_mime,
"checksum": self.serialize_checksum,
"checksumMD5": self.serialize_checksum_md5,
"fileSize": self.serialize_file_size,
"canvasWidth": self.serialize_canvas_width,
"canvasHeight": self.serialize_canvas_height,
@ -227,6 +228,9 @@ class PostSerializer(serialization.BaseSerializer):
def serialize_checksum(self) -> Any:
    """Return the ``checksum`` attribute of the post being serialized."""
    post = self.post
    return post.checksum
def serialize_checksum_md5(self) -> Any:
    """Return the ``checksum_md5`` attribute of the post being serialized."""
    post = self.post
    return post.checksum_md5
def serialize_file_size(self) -> Any:
    """Return the ``file_size`` attribute of the post being serialized."""
    post = self.post
    return post.file_size
@ -577,7 +581,25 @@ def update_all_post_signatures() -> None:
post, files.get(get_post_content_path(post))
)
db.session.commit()
logger.info("Hashed Post %d", post.post_id)
logger.info("Created Signature - Post %d", post.post_id)
except Exception as ex:
logger.exception(ex)
def update_all_md5_checksums() -> None:
    """Fill in the MD5 checksum of every post that does not have one yet.

    Posts whose ``checksum_md5`` is NULL are processed in ascending
    ``post_id`` order.  Each post is committed individually, so checksums
    already written are kept when a later post fails.  A failure is logged,
    the session is rolled back, and the loop continues with the next post.
    """
    posts_to_hash = (
        db.session.query(model.Post)
        .filter(model.Post.checksum_md5.is_(None))
        .order_by(model.Post.post_id.asc())
        .all()
    )
    for post in posts_to_hash:
        try:
            post.checksum_md5 = util.get_md5(
                files.get(get_post_content_path(post))
            )
            db.session.commit()
            logger.info("Created MD5 - Post %d", post.post_id)
        except Exception as ex:
            logger.exception(ex)
            # After a failed flush/commit the session refuses further work
            # until it is rolled back; without this, one bad post would make
            # every remaining commit in the loop fail as well.
            db.session.rollback()
@ -605,6 +627,7 @@ def update_post_content(post: model.Post, content: Optional[bytes]) -> None:
)
post.checksum = util.get_sha1(content)
post.checksum_md5 = util.get_md5(content)
other_post = (
db.session.query(model.Post)
.filter(model.Post.checksum == post.checksum)

View file

@ -0,0 +1,22 @@
"""
Add MD5 checksums to posts
Revision ID: adcd63ff76a2
Created at: 2021-01-05 17:08:21.741601
"""
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers: `revision` matches the "Revision ID" in the
# module docstring; `down_revision` is presumably the revision this one is
# applied on top of (standard Alembic convention).
revision = "adcd63ff76a2"
down_revision = "c867abb456b1"
branch_labels = None
depends_on = None
def upgrade():
    """Add the 32-character Unicode ``checksum_md5`` column to ``post``."""
    md5_column = sa.Column("checksum_md5", sa.Unicode(32))
    op.add_column("post", md5_column)
def downgrade():
    """Drop the ``checksum_md5`` column from the ``post`` table."""
    table, column = "post", "checksum_md5"
    op.drop_column(table, column)

View file

@ -217,6 +217,7 @@ class Post(Base):
# content description
type = sa.Column("type", sa.Unicode(32), nullable=False)
checksum = sa.Column("checksum", sa.Unicode(64), nullable=False)
checksum_md5 = sa.Column("checksum_md5", sa.Unicode(32))
file_size = sa.Column("file_size", sa.BigInteger)
canvas_width = sa.Column("image_width", sa.Integer)
canvas_height = sa.Column("image_height", sa.Integer)

View file

@ -91,6 +91,7 @@ def test_too_large_download(url):
],
)
def test_video_download(url, expected_sha1):
    """Download *url* with the video downloader and compare its SHA1."""
    # Imperative xfail: pytest.xfail() raises right away, so the download and
    # the assertion below do not run while this line is in place.
    pytest.xfail("Current youtube-dl implementation is unstable")
    actual_content = net.download(url, use_video_downloader=True)
    assert get_sha1(actual_content) == expected_sha1

View file

@ -135,6 +135,7 @@ def test_serialize_post(
post.source = "4gag"
post.type = model.Post.TYPE_IMAGE
post.checksum = "deadbeef"
post.checksum_md5 = "deadbeef"
post.mime_type = "image/jpeg"
post.file_size = 100
post.user = user_factory(name="post author")
@ -219,6 +220,7 @@ def test_serialize_post(
"source": "4gag",
"type": "image",
"checksum": "deadbeef",
"checksumMD5": "deadbeef",
"fileSize": 100,
"canvasWidth": 200,
"canvasHeight": 300,
@ -431,8 +433,11 @@ def test_update_post_content_for_new_post(
expected_type,
output_file_name,
):
with patch("szurubooru.func.util.get_sha1"):
with patch("szurubooru.func.util.get_sha1"), patch(
"szurubooru.func.util.get_md5"
):
util.get_sha1.return_value = "crc"
util.get_md5.return_value = "md5"
config_injector(
{
"data_dir": str(tmpdir.mkdir("data")),
@ -458,6 +463,7 @@ def test_update_post_content_for_new_post(
assert post.mime_type == expected_mime_type
assert post.type == expected_type
assert post.checksum == "crc"
assert post.checksum_md5 == "md5"
assert os.path.exists(output_file_path)
if post.type in (model.Post.TYPE_IMAGE, model.Post.TYPE_ANIMATION):
assert db.session.query(model.PostSignature).count() == 1