server/net: improve youtube-dl functionality, enforce size limits

This commit is contained in:
Shyam Sunder 2021-01-05 15:18:34 -05:00
parent 2dfd1c2192
commit c7461c7f65
4 changed files with 54 additions and 65 deletions

View file

@ -21,7 +21,7 @@ RUN apk --no-cache add \
&& pip3 install --no-cache-dir --disable-pip-version-check \ && pip3 install --no-cache-dir --disable-pip-version-check \
alembic \ alembic \
"coloredlogs==5.0" \ "coloredlogs==5.0" \
youtube-dl \ youtube_dl \
&& apk --no-cache del py3-pip && apk --no-cache del py3-pip
COPY ./ /opt/app/ COPY ./ /opt/app/

View file

@ -9,4 +9,4 @@ pillow>=4.3.0
pynacl>=1.2.1 pynacl>=1.2.1
pytz>=2018.3 pytz>=2018.3
pyRFC3339>=1.0 pyRFC3339>=1.0
youtube-dl youtube_dl

View file

@ -1,76 +1,75 @@
import json import json
import logging import logging
import os import subprocess
import urllib.error import urllib.error
import urllib.request import urllib.request
from tempfile import NamedTemporaryFile
from threading import Thread from threading import Thread
from typing import Any, Dict, List from typing import Any, Dict, List
from youtube_dl import YoutubeDL
from youtube_dl.utils import YoutubeDLError
from szurubooru import config, errors from szurubooru import config, errors
from szurubooru.func import mime, util
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Number of bytes (2 ** 15 = 32768) requested per read() call while a
# download is streamed in chunks.
_dl_chunk_size = 2 ** 15
class DownloadError(errors.ProcessingError):
    """Raised when the content behind a URL could not be downloaded."""
class DownloadTooLargeError(DownloadError):
    """Raised when a download exceeds the configured size limit."""
def download(url: str, use_video_downloader: bool = False) -> bytes:
    """Fetch the resource at ``url`` and return its raw bytes.

    The response is read in ``_dl_chunk_size`` pieces so the transfer is
    abandoned as soon as it grows past ``config.config["max_dl_filesize"]``
    instead of being buffered in full first.

    :param url: location to fetch; must be non-empty.
    :param use_video_downloader: when true, ``url`` is first resolved to a
        direct content URL via ``_get_youtube_dl_content_url``.
    :returns: the complete response body.
    :raises DownloadTooLargeError: more bytes than the configured limit
        were received.
    :raises DownloadError: the request could not be built or opened, or a
        read from the connection failed.
    """
    assert url
    if use_video_downloader:
        url = _get_youtube_dl_content_url(url)

    # Read the settings up front so a configuration problem is not
    # misreported as a download failure by the handlers below.
    user_agent = config.config["user_agent"]
    max_size = config.config["max_dl_filesize"]

    try:
        request = urllib.request.Request(url)
        if user_agent:
            request.add_header("User-Agent", user_agent)
        request.add_header("Referer", url)
        handle = urllib.request.urlopen(request)
    except Exception:
        # Malformed URLs, unknown schemes, DNS/HTTP failures: report them
        # all through the module's own exception type.
        raise DownloadError(url) from None

    chunks = []
    received = 0
    with handle:
        while True:
            try:
                chunk = handle.read(_dl_chunk_size)
            except Exception:
                raise DownloadError(url) from None
            if not chunk:
                break
            received += len(chunk)
            if received > max_size:
                raise DownloadTooLargeError(url)
            chunks.append(chunk)
    # A single join avoids re-copying the whole buffer for every chunk.
    return b"".join(chunks)
def _get_youtube_dl_content_url(url: str) -> str:
    """Ask the ``youtube-dl`` executable for the direct content URL of ``url``.

    Runs ``youtube-dl --format best [--user-agent UA] --get-url -- URL`` and
    returns the first line it prints, stripped of surrounding whitespace.

    :param url: page URL to resolve.
    :returns: the first URL printed by ``youtube-dl``.
    :raises errors.ThirdPartyError: the executable could not be started,
        exited with a non-zero status, or printed no URL.
    """
    cmd = ["youtube-dl", "--format", "best"]
    if config.config["user_agent"]:
        cmd.extend(["--user-agent", config.config["user_agent"]])
    # "--" ends option parsing, so a URL that begins with "-" is taken as a
    # URL and cannot be interpreted as a youtube-dl command-line flag.
    cmd.extend(["--get-url", "--", url])

    message = "Could not extract content location from %s" % (url)
    try:
        result = subprocess.run(
            cmd, text=True, capture_output=True, check=True
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing or non-executable youtube-dl binary.
        raise errors.ThirdPartyError(message) from None

    content_url = result.stdout.split("\n")[0].strip()
    if not content_url:
        # Exit status 0 but nothing printed: there is no URL to hand back.
        raise errors.ThirdPartyError(message)
    return content_url
def post_to_webhooks(payload: Dict[str, Any]) -> List[Thread]:
    """Start one thread per configured webhook to deliver ``payload``.

    Each thread runs ``_post_to_webhook(webhook, payload)``. The threads are
    created non-daemonic and are already started when returned; callers may
    ``join()`` them to wait for delivery.

    :param payload: data handed unchanged to every webhook call.
    :returns: the started threads, one per entry in
        ``config.config["webhooks"]`` (empty when that setting is falsy).
    """
    threads = [
        Thread(target=_post_to_webhook, args=(webhook, payload), daemon=False)
        for webhook in (config.config["webhooks"] or [])
    ]
    for thread in threads:
        thread.start()
    return threads

View file

@ -1,6 +1,3 @@
from datetime import datetime
from unittest.mock import patch
import pytest import pytest
from szurubooru import errors from szurubooru import errors
@ -69,41 +66,34 @@ def test_download():
"url", "url",
[ [
"https://samples.ffmpeg.org/MPEG-4/video.mp4", "https://samples.ffmpeg.org/MPEG-4/video.mp4",
"https://www.youtube.com/watch?v=dQw4w9WgXcQ",
], ],
) )
def test_too_large_download(url): def test_too_large_download(url):
pytest.xfail("Download limit not implemented yet") with pytest.raises(net.DownloadTooLargeError):
with pytest.raises(errors.ProcessingError): net.download(url, use_video_downloader=True)
net.download(url)
@pytest.mark.parametrize(
    "url,expected_sha1",
    [
        (
            "https://gfycat.com/immaterialchillyiberianmole",
            "0125976d2439e651b6863438db30de58f79f7754",
        ),
        (
            "https://upload.wikimedia.org/wikipedia/commons/a/ad/Utah_teapot.png",  # noqa: E501
            "cfadcbdeda1204dc1363ee5c1969191f26be2e41",
        ),
    ],
)
def test_content_download(url, expected_sha1):
    """Content fetched through the video downloader matches a known SHA-1.

    NOTE(review): this test contacts the listed hosts, so it needs network
    access and depends on the remote files staying unchanged.
    """
    actual_content = net.download(url, use_video_downloader=True)
    assert get_sha1(actual_content) == expected_sha1
# NOTE(review): "downlaod" in the name is a misspelling of "download"; it is
# kept as-is so that existing test selections by name keep matching.
def test_bad_content_downlaod():
    """Downloading this HTML page via the video downloader must fail.

    The call is expected to raise ``errors.ThirdPartyError``.
    """
    url = "http://info.cern.ch/hypertext/WWW/TheProject.html"
    with pytest.raises(errors.ThirdPartyError):
        net.download(url, use_video_downloader=True)