From c7461c7f65cfc42e6f33970c47f1030ed2c3a184 Mon Sep 17 00:00:00 2001 From: Shyam Sunder Date: Tue, 5 Jan 2021 15:18:34 -0500 Subject: [PATCH] server/net: improve youtube-dl functionality, enforce size limits --- server/Dockerfile | 2 +- server/requirements.txt | 2 +- server/szurubooru/func/net.py | 85 ++++++++++++------------ server/szurubooru/tests/func/test_net.py | 30 +++------ 4 files changed, 54 insertions(+), 65 deletions(-) diff --git a/server/Dockerfile b/server/Dockerfile index 9a597a13..99b05bc7 100644 --- a/server/Dockerfile +++ b/server/Dockerfile @@ -21,7 +21,7 @@ RUN apk --no-cache add \ && pip3 install --no-cache-dir --disable-pip-version-check \ alembic \ "coloredlogs==5.0" \ - youtube-dl \ + youtube_dl \ && apk --no-cache del py3-pip COPY ./ /opt/app/ diff --git a/server/requirements.txt b/server/requirements.txt index d80ec060..35e56abb 100644 --- a/server/requirements.txt +++ b/server/requirements.txt @@ -9,4 +9,4 @@ pillow>=4.3.0 pynacl>=1.2.1 pytz>=2018.3 pyRFC3339>=1.0 -youtube-dl +youtube_dl diff --git a/server/szurubooru/func/net.py b/server/szurubooru/func/net.py index 4e4c222a..c153162e 100644 --- a/server/szurubooru/func/net.py +++ b/server/szurubooru/func/net.py @@ -1,76 +1,75 @@ import json import logging -import os +import subprocess import urllib.error import urllib.request -from tempfile import NamedTemporaryFile from threading import Thread from typing import Any, Dict, List -from youtube_dl import YoutubeDL -from youtube_dl.utils import YoutubeDLError - from szurubooru import config, errors -from szurubooru.func import mime, util logger = logging.getLogger(__name__) +_dl_chunk_size = 2 ** 15 + + +class DownloadError(errors.ProcessingError): + pass + + +class DownloadTooLargeError(DownloadError): + pass def download(url: str, use_video_downloader: bool = False) -> bytes: assert url + if use_video_downloader: + url = _get_youtube_dl_content_url(url) + request = urllib.request.Request(url) if config.config["user_agent"]: request.add_header("User-Agent", config.config["user_agent"]) request.add_header("Referer", url) - try: - with urllib.request.urlopen(request) as handle: - content = handle.read() - except Exception as ex: - raise errors.ProcessingError("Error downloading %s (%s)" % (url, ex)) - if ( - use_video_downloader - and mime.get_mime_type(content) == "application/octet-stream" - ): - return _youtube_dl_wrapper(url) - return content + + content_buffer = b"" + length_tally = 0 + with urllib.request.urlopen(request) as handle: + while True: + try: + chunk = handle.read(_dl_chunk_size) + except Exception: + raise DownloadError(url) from None + if not chunk: + break + length_tally += len(chunk) + if length_tally > config.config["max_dl_filesize"]: + raise DownloadTooLargeError(url) + content_buffer += chunk + return content_buffer -def _youtube_dl_wrapper(url: str) -> bytes: - outpath = os.path.join( - config.config["data_dir"], - "temporary-uploads", - "youtubedl-" + util.get_sha1(url)[0:8] + ".dat", - ) - options = { - "ignoreerrors": False, - "format": "best[ext=webm]/best[ext=mp4]/best[ext=flv]", - "logger": logger, - "max_filesize": config.config["max_dl_filesize"], - "max_downloads": 1, - "outtmpl": outpath, - } +def _get_youtube_dl_content_url(url: str) -> str: + cmd = ["youtube-dl", "--format", "best"] + if config.config["user_agent"]: + cmd.extend(["--user-agent", config.config["user_agent"]]) + cmd.extend(["--get-url", url]) try: - with YoutubeDL(options) as ydl: - ydl.extract_info(url, download=True) - with open(outpath, "rb") as f: - return f.read() - except YoutubeDLError as ex: - raise errors.ThirdPartyError( - "Error downloading video %s (%s)" % (url, ex) + return ( + subprocess.run(cmd, text=True, capture_output=True, check=True) + .stdout.split("\n")[0] + .strip() ) - except FileNotFoundError: + except subprocess.CalledProcessError: raise errors.ThirdPartyError( - "Error downloading video %s (file could not be saved)" % (url) - ) + "Could not extract content location from %s" % (url) + ) from None def post_to_webhooks(payload: Dict[str, Any]) -> List[Thread]: threads = [ - Thread(target=_post_to_webhook, args=(webhook, payload)) + Thread(target=_post_to_webhook, args=(webhook, payload), daemon=False) for webhook in (config.config["webhooks"] or []) ] for thread in threads: - thread.daemon = False thread.start() return threads diff --git a/server/szurubooru/tests/func/test_net.py b/server/szurubooru/tests/func/test_net.py index f52a3cf4..19e31b6b 100644 --- a/server/szurubooru/tests/func/test_net.py +++ b/server/szurubooru/tests/func/test_net.py @@ -1,6 +1,3 @@ -from datetime import datetime -from unittest.mock import patch - import pytest from szurubooru import errors @@ -69,41 +66,34 @@ def test_download(): "url", [ "https://samples.ffmpeg.org/MPEG-4/video.mp4", + "https://www.youtube.com/watch?v=dQw4w9WgXcQ", ], ) def test_too_large_download(url): - pytest.xfail("Download limit not implemented yet") - with pytest.raises(errors.ProcessingError): - net.download(url) + with pytest.raises(net.DownloadTooLargeError): + net.download(url, use_video_downloader=True) @pytest.mark.parametrize( "url,expected_sha1", [ ( - "https://www.youtube.com/watch?v=C0DPdy98e4c", - "365af1c8f59c6865e1a84c6e13e3e25ff89e0ba1", + "https://gfycat.com/immaterialchillyiberianmole", + "0125976d2439e651b6863438db30de58f79f7754", ), ( - "https://gfycat.com/immaterialchillyiberianmole", - "953000e81d7bd1da95ce264f872e7b6c4a6484be", + "https://upload.wikimedia.org/wikipedia/commons/a/ad/Utah_teapot.png", # noqa: E501 + "cfadcbdeda1204dc1363ee5c1969191f26be2e41", ), ], ) -def test_video_download(url, expected_sha1): - pytest.xfail("Current youtube-dl implementation is unstable") +def test_content_download(url, expected_sha1): actual_content = net.download(url, use_video_downloader=True) assert get_sha1(actual_content) == expected_sha1 -@pytest.mark.parametrize( - "url", - [ - "https://samples.ffmpeg.org/flac/short.flac", # not a video - "https://www.youtube.com/watch?v=dQw4w9WgXcQ", # video too large - ], -) -def test_failed_video_download(url): +def test_bad_content_downlaod(): + url = "http://info.cern.ch/hypertext/WWW/TheProject.html" with pytest.raises(errors.ThirdPartyError): net.download(url, use_video_downloader=True)