Support rendering previews with data: URLs in them #11767
Changes from all commits: ff7c377, 5b09001, fb448dd, 9b98fea, f07df94, 6cd9b5b, 88917c0, b9a1c66, 116015f, 3b38444, e042244, deeec27, 5124a94, d38a07c, 043b4c1, 2042da1
Changelog entry (new file):

```diff
@@ -0,0 +1 @@
+Fix a long-standing bug when previewing Reddit URLs which do not contain an image.
```
The `rebase_url` helper:

```diff
@@ -321,14 +321,33 @@ def _iterate_over_text(
 def rebase_url(url: str, base: str) -> str:
-    base_parts = list(urlparse.urlparse(base))
+    """
+    Resolves a potentially relative `url` against an absolute `base` URL.
+
+    For example:
+
+    >>> rebase_url("subpage", "https://example.com/foo/")
+    'https://example.com/foo/subpage'
+    >>> rebase_url("sibling", "https://example.com/foo")
+    'https://example.com/sibling'
+    >>> rebase_url("/bar", "https://example.com/foo/")
+    'https://example.com/bar'
+    >>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
+    'https://alice.com/a/'
+    """
+    base_parts = urlparse.urlparse(base)
+    # Convert the parsed URL to a list for (potential) modification.
     url_parts = list(urlparse.urlparse(url))
-    if not url_parts[0]:  # fix up schema
-        url_parts[0] = base_parts[0] or "http"
-    if not url_parts[1]:  # fix up hostname
-        url_parts[1] = base_parts[1]
-    if not url_parts[2].startswith("/"):
-        url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts[2]) + url_parts[2]
+    # Add a scheme, if one does not exist.
+    if not url_parts[0]:
+        url_parts[0] = base_parts.scheme or "http"
+    # Fix up the hostname, if this is not a data URL.
+    if url_parts[0] != "data" and not url_parts[1]:
+        url_parts[1] = base_parts.netloc
+        # If the path does not start with a /, nest it under the base path's last
+        # directory.
+        if not url_parts[2].startswith("/"):
+            url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
     return urlparse.urlunparse(url_parts)
```

Review thread (on `rebase_url`):

> Do you know how this differs from […]?

> I didn't know of it, but I'll look into it!

> I think this will work OK, but I'd rather punt this to a separate PR since it is pretty unrelated to this work. Would that be OK?

> yeah OK
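The elided reference in the first comment is presumably `urllib.parse.urljoin` (an assumption; the link is lost in the source). For comparison, here are the docstring's cases run through `urljoin`, which resolves relative references per RFC 3986 and agrees with `rebase_url` on all four:

```python
# Illustrative comparison, not part of the PR. Note the argument order:
# urljoin(base, url) is the reverse of rebase_url(url, base).
from urllib.parse import urljoin

cases = [
    ("subpage", "https://example.com/foo/"),
    ("sibling", "https://example.com/foo"),
    ("/bar", "https://example.com/foo/"),
    ("https://alice.com/a/", "https://example.com/foo/"),
]

for url, base in cases:
    print(urljoin(base, url))
# https://example.com/foo/subpage
# https://example.com/sibling
# https://example.com/bar
# https://alice.com/a/
```

Where they differ: `rebase_url` defaults a missing scheme to `http` and, after this PR, leaves `data:` URLs untouched, which is why the swap to `urljoin` was punted to a separate PR.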
The URL preview resource:

```diff
@@ -21,8 +21,9 @@
 import shutil
 import sys
 import traceback
-from typing import TYPE_CHECKING, Iterable, Optional, Tuple
+from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
 from urllib import parse as urlparse
+from urllib.request import urlopen

 import attr
```
```diff
@@ -70,6 +71,17 @@
 IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY


+@attr.s(slots=True, frozen=True, auto_attribs=True)
+class DownloadResult:
+    length: int
+    uri: str
+    response_code: int
+    media_type: str
+    download_name: Optional[str]
+    expires: int
+    etag: Optional[str]
+
+
 @attr.s(slots=True, frozen=True, auto_attribs=True)
 class MediaInfo:
     """
```
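`DownloadResult` is a frozen, slotted attrs class: an immutable value object returned by both download paths. A minimal sketch of its semantics (the field values are invented for illustration):

```python
# Standalone sketch; the class body mirrors the diff above.
from typing import Optional

import attr

@attr.s(slots=True, frozen=True, auto_attribs=True)
class DownloadResult:
    length: int
    uri: str
    response_code: int
    media_type: str
    download_name: Optional[str]
    expires: int
    etag: Optional[str]

result = DownloadResult(
    length=1024,
    uri="https://example.com/page.html",
    response_code=200,
    media_type="text/html",
    download_name=None,
    expires=60 * 60 * 1000,  # one hour, in milliseconds
    etag=None,
)

# frozen=True rejects mutation (raising FrozenInstanceError);
# attr.evolve is the supported way to derive a modified copy.
expired = attr.evolve(result, expires=0)
```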
```diff
@@ -256,7 +268,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
         if oembed_url:
             url_to_download = oembed_url

-        media_info = await self._download_url(url_to_download, user)
+        media_info = await self._handle_url(url_to_download, user)

         logger.debug("got media_info of '%s'", media_info)
```
```diff
@@ -297,7 +309,9 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
         oembed_url = self._oembed.autodiscover_from_html(tree)
         og_from_oembed: JsonDict = {}
         if oembed_url:
-            oembed_info = await self._download_url(oembed_url, user)
+            oembed_info = await self._handle_url(
+                oembed_url, user, allow_data_urls=True
+            )
             (
                 og_from_oembed,
                 author_name,
```
```diff
@@ -367,7 +381,135 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:

         return jsonog.encode("utf8")

-    async def _download_url(self, url: str, user: UserID) -> MediaInfo:
+    async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
+        """
+        Fetches a remote URL and parses the headers.
+
+        Args:
+            url: The URL to fetch.
+            output_stream: The stream to write the content to.
+
+        Returns:
+            A DownloadResult with the media length, the URL downloaded, the
+            HTTP response code, the media type, the downloaded file name, the
+            number of milliseconds the result is valid for, and the ETag header.
+        """
+
+        try:
+            logger.debug("Trying to get preview for url '%s'", url)
+            length, headers, uri, code = await self.client.get_file(
+                url,
+                output_stream=output_stream,
+                max_size=self.max_spider_size,
+                headers={"Accept-Language": self.url_preview_accept_language},
+            )
+        except SynapseError:
+            # Pass SynapseErrors through directly, so that the servlet
+            # handler will return a SynapseError to the client instead of
+            # blank data or a 500.
+            raise
+        except DNSLookupError:
+            # DNS lookup returned no results
+            # Note: This will also be the case if one of the resolved IP
+            # addresses is blacklisted
+            raise SynapseError(
+                502,
+                "DNS resolution failure during URL preview generation",
+                Codes.UNKNOWN,
+            )
+        except Exception as e:
+            # FIXME: pass through 404s and other error messages nicely
+            logger.warning("Error downloading %s: %r", url, e)
+
+            raise SynapseError(
+                500,
+                "Failed to download content: %s"
+                % (traceback.format_exception_only(sys.exc_info()[0], e),),
+                Codes.UNKNOWN,
+            )
+
+        if b"Content-Type" in headers:
+            media_type = headers[b"Content-Type"][0].decode("ascii")
+        else:
+            media_type = "application/octet-stream"
+
+        download_name = get_filename_from_headers(headers)
+
+        # FIXME: we should calculate a proper expiration based on the
+        # Cache-Control and Expire headers. But for now, assume 1 hour.
+        expires = ONE_HOUR
+        etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
+
+        return DownloadResult(
+            length, uri, code, media_type, download_name, expires, etag
+        )
+
+    async def _parse_data_url(
+        self, url: str, output_stream: BinaryIO
+    ) -> DownloadResult:
+        """
+        Parses a data: URL.
+
+        Args:
+            url: The URL to parse.
+            output_stream: The stream to write the content to.
+
+        Returns:
+            A DownloadResult with the media length, the URL downloaded, the
+            HTTP response code, the media type, the downloaded file name, the
+            number of milliseconds the result is valid for, and the ETag header.
+        """
+
+        try:
+            logger.debug("Trying to parse data url '%s'", url)
+            with urlopen(url) as url_info:
+                # TODO: Can this be more efficient?
+                output_stream.write(url_info.read())
+        except Exception as e:
+            logger.warning("Error parsing data: URL %s: %r", url, e)
+
+            raise SynapseError(
+                500,
+                "Failed to parse data URL: %s"
+                % (traceback.format_exception_only(sys.exc_info()[0], e),),
+                Codes.UNKNOWN,
+            )
+
+        return DownloadResult(
+            # Read back the length that has been written.
+            length=output_stream.tell(),
+            uri=url,
+            # If it was parsed, consider this a 200 OK.
+            response_code=200,
+            # urlopen shoves the media-type from the data URL into the content
+            # type header object.
+            media_type=url_info.headers.get_content_type(),
+            # Some features are not supported by data: URLs.
+            download_name=None,
+            expires=ONE_HOUR,
+            etag=None,
+        )
+
+    async def _handle_url(
+        self, url: str, user: UserID, allow_data_urls: bool = False
+    ) -> MediaInfo:
+        """
+        Fetches content from a URL and parses the result to generate a MediaInfo.
+
+        It uses the media storage provider to persist the fetched content and
+        stores the mapping into the database.
+
+        Args:
+            url: The URL to fetch.
+            user: The user who has requested this URL.
+            allow_data_urls: True if data URLs should be allowed.
+
+        Returns:
+            A MediaInfo object describing the fetched content.
+        """
+
         # TODO: we should probably honour robots.txt... except in practice
         # we're most likely being explicitly triggered by a human rather than a
         # bot, so are we really a robot?
```
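`_parse_data_url` leans on the standard library: `urllib.request.urlopen` has a built-in handler for `data:` URLs that decodes the (possibly base64-encoded) payload and exposes the declared media type through the response's header object. A standalone sketch of that behaviour (the example URL is invented):

```python
# The stdlib behaviour _parse_data_url relies on; "aGVsbG8gd29ybGQ=" is
# just "hello world" in base64.
from urllib.request import urlopen

url = "data:text/plain;base64,aGVsbG8gd29ybGQ="
with urlopen(url) as url_info:
    body = url_info.read()  # the base64 payload, decoded
    media_type = url_info.headers.get_content_type()

assert body == b"hello world"
assert media_type == "text/plain"
```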
```diff
@@ -377,61 +519,27 @@ async def _download_url(self, url: str, user: UserID) -> MediaInfo:
         file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

         with self.media_storage.store_into_file(file_info) as (f, fname, finish):
-            try:
-                logger.debug("Trying to get preview for url '%s'", url)
-                length, headers, uri, code = await self.client.get_file(
-                    url,
-                    output_stream=f,
-                    max_size=self.max_spider_size,
-                    headers={"Accept-Language": self.url_preview_accept_language},
-                )
-            except SynapseError:
-                # Pass SynapseErrors through directly, so that the servlet
-                # handler will return a SynapseError to the client instead of
-                # blank data or a 500.
-                raise
-            except DNSLookupError:
-                # DNS lookup returned no results
-                # Note: This will also be the case if one of the resolved IP
-                # addresses is blacklisted
-                raise SynapseError(
-                    502,
-                    "DNS resolution failure during URL preview generation",
-                    Codes.UNKNOWN,
-                )
-            except Exception as e:
-                # FIXME: pass through 404s and other error messages nicely
-                logger.warning("Error downloading %s: %r", url, e)
-
-                raise SynapseError(
-                    500,
-                    "Failed to download content: %s"
-                    % (traceback.format_exception_only(sys.exc_info()[0], e),),
-                    Codes.UNKNOWN,
-                )
-            await finish()
-
-            if b"Content-Type" in headers:
-                media_type = headers[b"Content-Type"][0].decode("ascii")
+            if url.startswith("data:"):
+                if not allow_data_urls:
+                    raise SynapseError(
+                        500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
+                    )
+
+                download_result = await self._parse_data_url(url, f)
             else:
-                media_type = "application/octet-stream"
+                download_result = await self._download_url(url, f)

-            download_name = get_filename_from_headers(headers)
-
-            # FIXME: we should calculate a proper expiration based on the
-            # Cache-Control and Expire headers. But for now, assume 1 hour.
-            expires = ONE_HOUR
-            etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
+            await finish()

         try:
             time_now_ms = self.clock.time_msec()

             await self.store.store_local_media(
                 media_id=file_id,
-                media_type=media_type,
+                media_type=download_result.media_type,
                 time_now_ms=time_now_ms,
-                upload_name=download_name,
-                media_length=length,
+                upload_name=download_result.download_name,
+                media_length=download_result.length,
                 user_id=user,
                 url_cache=url,
             )
```

Review thread (on the `else:` branch):

> We might want to assert HTTP(S) schemes here as additional error checking.

> Yeah, I feel like I'd be happier having a few select allowed protocols rather than singling out […]

> Yep, note that those will fail right now (we won't try them), but it would be better to be explicit, I think!

> I think this might actually be tougher to get correct than I initially thought, since we seem to support previewing scheme-less URLs. I'm not sure it makes sense to refactor that as part of this.

> Yeah, fair enough.
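A hypothetical sketch (not in the PR) of the explicit scheme allow-list floated in the thread above; the PR keeps the simpler `startswith("data:")` check, partly because scheme-less URLs are still accepted for previews:

```python
# Illustration only: an allow-list gate such as the reviewers discuss.
from urllib.parse import urlparse

ALLOWED_SCHEMES = {"http", "https", ""}  # "" covers scheme-less URLs

def is_previewable(url: str, allow_data_urls: bool = False) -> bool:
    scheme = urlparse(url).scheme
    if scheme == "data":
        # data: URLs are only allowed for server-initiated sub-requests
        # (oEmbed discovery, og:image precaching), never user input.
        return allow_data_urls
    return scheme in ALLOWED_SCHEMES
```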
```diff
@@ -444,16 +552,16 @@ async def _download_url(self, url: str, user: UserID) -> MediaInfo:
             raise

         return MediaInfo(
-            media_type=media_type,
-            media_length=length,
-            download_name=download_name,
+            media_type=download_result.media_type,
+            media_length=download_result.length,
+            download_name=download_result.download_name,
             created_ts_ms=time_now_ms,
             filesystem_id=file_id,
             filename=fname,
-            uri=uri,
-            response_code=code,
-            expires=expires,
-            etag=etag,
+            uri=download_result.uri,
+            response_code=download_result.response_code,
+            expires=download_result.expires,
+            etag=download_result.etag,
         )

     async def _precache_image_url(
```
```diff
@@ -474,8 +582,8 @@ async def _precache_image_url(
         # FIXME: it might be cleaner to use the same flow as the main /preview_url
         # request itself and benefit from the same caching etc. But for now we
         # just rely on the caching on the master request to speed things up.
-        image_info = await self._download_url(
-            rebase_url(og["og:image"], media_info.uri), user
+        image_info = await self._handle_url(
+            rebase_url(og["og:image"], media_info.uri), user, allow_data_urls=True
         )

         if _is_media(image_info.media_type):
```
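This call site is the heart of the fix: a page's `og:image` can itself be a `data:` URL, and it is now fetched with `allow_data_urls=True` after rebasing. A quick check (assuming the `rebase_url` shown earlier in this PR) that such a URL survives rebasing unmangled:

```python
# Demonstration only; assumes rebase_url as defined earlier in this diff.
# Without the data: special case, the base page's hostname and path would
# be spliced into the result.
print(rebase_url("data:image/png;base64,iVBORw0KGgo=", "https://example.com/post/123"))
# data:image/png;base64,iVBORw0KGgo=
```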