Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 138 additions & 5 deletions tests/unit/forklift/test_legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4864,12 +4864,145 @@ def test_missing_trailing_slash_redirect(pyramid_request):
"https://github.com",
False,
),
( # Publisher URL is None
"https://github.com/owner/project",
None,
],
)
def test_verify_url_with_trusted_publisher(url, publisher_url, expected):
assert legacy._verify_url_with_trusted_publisher(url, publisher_url) == expected


@pytest.mark.parametrize(
("url", "project_name", "project_normalized_name", "expected"),
[
( # PyPI /project/ case
"https://pypi.org/project/myproject",
"myproject",
"myproject",
True,
),
( # PyPI /p/ case
"https://pypi.org/p/myproject",
"myproject",
"myproject",
True,
),
( # pypi.python.org /project/ case
"https://pypi.python.org/project/myproject",
"myproject",
"myproject",
True,
),
( # pypi.python.org /p/ case
"https://pypi.python.org/p/myproject",
"myproject",
"myproject",
True,
),
( # python.org/pypi/ case
"https://python.org/pypi/myproject",
"myproject",
"myproject",
True,
),
( # PyPI /project/ case
"https://pypi.org/project/myproject",
"myproject",
"myproject",
True,
),
( # Normalized name differs from URL
"https://pypi.org/project/my_project",
"my_project",
"my-project",
True,
),
( # Normalized name same as URL
"https://pypi.org/project/my-project",
"my_project",
"my-project",
True,
),
( # Trailing slash
"https://pypi.org/project/myproject/",
"myproject",
"myproject",
True,
),
( # Domains are case insensitive
"https://PyPI.org/project/myproject",
"myproject",
"myproject",
True,
),
( # Paths are case-sensitive
"https://pypi.org/Project/myproject",
"myproject",
"myproject",
False,
),
( # Wrong domain
"https://example.com/project/myproject",
"myproject",
"myproject",
False,
),
( # Wrong path
"https://pypi.org/something/myproject",
"myproject",
"myproject",
False,
),
( # Path has extra components
"https://pypi.org/something/myproject/something",
"myproject",
"myproject",
False,
),
( # Wrong package name
"https://pypi.org/project/otherproject",
"myproject",
"myproject",
False,
),
( # Similar package name
"https://pypi.org/project/myproject",
"myproject2",
"myproject2",
False,
),
( # Similar package name
"https://pypi.org/project/myproject2",
"myproject",
"myproject",
False,
),
],
)
def test_verify_url(url, publisher_url, expected):
assert legacy._verify_url(url, publisher_url) == expected
def test_verify_url_pypi(url, project_name, project_normalized_name, expected):
assert (
legacy._verify_url_pypi(url, project_name, project_normalized_name) == expected
)


def test_verify_url():
# `_verify_url` is just a helper function that calls `_verify_url_pypi` and
# `_verify_url_with_trusted_publisher`, where the actual verification logic lives.
assert legacy._verify_url(
url="https://pypi.org/project/myproject/",
publisher_url=None,
project_name="myproject",
project_normalized_name="myproject",
)

assert legacy._verify_url(
url="https://github.com/org/myproject/issues",
publisher_url="https://github.com/org/myproject",
project_name="myproject",
project_normalized_name="myproject",
)

assert not legacy._verify_url(
url="example.com",
publisher_url="https://github.com/or/myproject",
project_name="myproject",
project_normalized_name="myproject",
)
52 changes: 47 additions & 5 deletions warehouse/forklift/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,31 @@ def _process_attestations(request, distribution: Distribution):
metrics.increment("warehouse.upload.attestations.ok")


def _verify_url(url: str, publisher_url: str | None) -> bool:
_pypi_project_urls = [
"https://pypi.org/project/",
"https://pypi.org/p/",
"https://pypi.python.org/project/",
"https://pypi.python.org/p/",
"https://python.org/pypi/",
]


def _verify_url_pypi(url: str, project_name: str, project_normalized_name: str) -> bool:
candidate_urls = (
f"{pypi_project_url}{name}{optional_slash}"
for pypi_project_url in _pypi_project_urls
for name in {project_name, project_normalized_name}
for optional_slash in ["/", ""]
)

user_uri = rfc3986.api.uri_reference(url).normalize()
return any(
user_uri == rfc3986.api.uri_reference(candidate_url).normalize()
for candidate_url in candidate_urls
)


def _verify_url_with_trusted_publisher(url: str, publisher_url: str) -> bool:
"""
Verify a given URL against a Trusted Publisher URL

Expand All @@ -473,9 +497,6 @@ def _verify_url(url: str, publisher_url: str | None) -> bool:
the authority includes the host, and in practice neither URL should have user
nor port information.
"""
if not publisher_url:
return False

publisher_uri = rfc3986.api.uri_reference(publisher_url).normalize()
user_uri = rfc3986.api.uri_reference(url).normalize()
if publisher_uri.path is None:
Expand All @@ -496,6 +517,22 @@ def _verify_url(url: str, publisher_url: str | None) -> bool:
)


def _verify_url(
url: str, publisher_url: str | None, project_name: str, project_normalized_name: str
) -> bool:
if _verify_url_pypi(
url=url,
project_name=project_name,
project_normalized_name=project_normalized_name,
):
return True

if not publisher_url:
return False

return _verify_url_with_trusted_publisher(url=url, publisher_url=publisher_url)


def _sort_releases(request: Request, project: Project):
releases = (
request.db.query(Release)
Expand Down Expand Up @@ -866,7 +903,12 @@ def file_upload(request):
else {
name: {
"url": url,
"verified": _verify_url(url=url, publisher_url=publisher_base_url),
"verified": _verify_url(
url=url,
publisher_url=publisher_base_url,
project_name=project.name,
project_normalized_name=project.normalized_name,
),
Comment on lines +906 to +911
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I would prefer something like this instead unless there's a strong reason not to:

Suggested change
"verified": _verify_url(
url=url,
publisher_url=publisher_base_url,
project_name=project.name,
project_normalized_name=project.normalized_name,
),
"verified": any(
check(
url=url,
publisher_url=publisher_base_url,
project_name=project.name,
project_normalized_name=project.normalized_name,
)
for check in [_verify_url_pypi, _verify_url_with_trusted_publisher]
),

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would mean we need to pass the same parameters to both _verify_* functions, so some of them will be unused in each.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's OK? I'm concerned that _verify_url has logic that short-circuits verification outside of the individual checks, and thinking it'll be more straightforward to just write additional checks and add them to this list than determine where they should fall in the logic for that function.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(But also, want to ensure that we're still lazily evaluating these checks)

Copy link
Contributor Author

@facutuesca facutuesca Aug 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see your point. Could you check trail-of-forks#3013 ? It's my idea for moving the TP-specific verification to the OIDCPublisherMixin class, since we'll need to specialize them depending on the specific TP provider.

If we go ahead with that, at least one check will be a method of that class, and the proposed change here would have to be:

any(
    check(...)
    for check in [_verify_url_pypi, publisher.verify_url] if publisher else [_verify_url_pypi])

(or similar), and we would still need to keep the same common list of parameters for all verify functions.

If that's fine I can accept the suggestion. My only concern is that as verification logic grows, having all current and future functions take the same (possibly growing) list of parameters might not be great.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since we'll need to specialize them depending on the specific TP provider.

That's a good point. I think I'd be in favor of that refactor once we introduce more verifications. Then this could become something like:

Suggested change
"verified": _verify_url(
url=url,
publisher_url=publisher_base_url,
project_name=project.name,
project_normalized_name=project.normalized_name,
),
"verified": any(
check(url)
for check in [
partial(
_verify_url_pypi(
project_name=project_name,
project_normalized_name=project_normalized_name,
)
),
partial(publisher.verify_url) if publisher else lambda _: False
),

I think I'm over-optimizing a bit here though, going to approve/merge this as-is for now and we can revisit when things change.

}
for name, url in meta.project_urls.items()
}
Expand Down