diff --git a/.appveyor.yml b/.appveyor.yml new file mode 100644 index 00000000..19f589e5 --- /dev/null +++ b/.appveyor.yml @@ -0,0 +1,38 @@ +# What Python version is installed where: +# http://www.appveyor.com/docs/installed-software#python + +# This configuration based on: +# https://github.com/cookiecutter/cookiecutter/blob/5e65edf4c340993f462ddeaf44f99eb6f9da66f9/appveyor.yml + +environment: + matrix: + - PYTHON: "C:\\Python36-x64" + TOX_ENV: "test-py36,codecov" + + - PYTHON: "C:\\Python37-x64" + TOX_ENV: "test-py37,codecov" + + - PYTHON: "C:\\Python38-x64" + TOX_ENV: "test-py38,codecov" + + +init: + - set OS=WINDOWS + - set PATH=%PYTHON%;%PYTHON%\Scripts;%PATH% + - "git config --system http.sslcainfo \"C:\\Program Files\\Git\\mingw64\\ssl\\certs\\ca-bundle.crt\"" + - "%PYTHON%/python -V" + - "%PYTHON%/python -c \"import struct;print(8 * struct.calcsize(\'P\'))\"" + - set + +install: + - "%PYTHON%/Scripts/easy_install -U pip" + - "%PYTHON%/Scripts/pip install -U --force-reinstall tox virtualenv wheel" + + +build: false # Not a C# project, build stuff at the test step instead. 
+ +test_script: + - "%PYTHON%/Scripts/tox -e %TOX_ENV%" + +artifacts: + - path: dist\* diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..c7166536 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,9 @@ +[run] +branch = True +omit = */flycheck_* + +[report] +precision = 2 +exclude_lines = + if TYPE_CHECKING + \s*\.\.\.$ diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml new file mode 100644 index 00000000..b5b7e9eb --- /dev/null +++ b/.github/workflows/cicd.yml @@ -0,0 +1,238 @@ +# Docs: +# https://help.github.com/en/actions/automating-your-workflow-with-github-actions + + + +name: CI/CD + + +on: + push: + branches: ["master"] + pull_request: + branches: ["master"] + + +jobs: + + info: + + name: Workflow information + runs-on: ubuntu-latest + timeout-minutes: 1 + + steps: + + - name: Print GitHub Context + env: + GITHUB_CONTEXT: ${{ toJson(github) }} + run: echo "${GITHUB_CONTEXT}"; + + - name: Print Job Context + env: + JOB_CONTEXT: ${{ toJson(job) }} + run: echo "${JOB_CONTEXT}"; + + - name: Print Steps Context + env: + STEPS_CONTEXT: ${{ toJson(steps) }} + run: echo "${STEPS_CONTEXT}"; + + - name: Print Runner Context + env: + RUNNER_CONTEXT: ${{ toJson(runner) }} + run: echo "${RUNNER_CONTEXT}"; + + - name: Print Strategy Context + env: + STRATEGY_CONTEXT: ${{ toJson(strategy) }} + run: echo "${STRATEGY_CONTEXT}"; + + - name: Print Matrix Context + env: + MATRIX_CONTEXT: ${{ toJson(matrix) }} + run: echo "${MATRIX_CONTEXT}"; + + + flake8: + + name: Flake8 (linter) + + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + + - name: Checkout source code + uses: actions/checkout@v2 + + - name: Install Python + uses: actions/setup-python@v1 + with: + python-version: "3.9" + + - name: Install Tox + run: pip install tox; + + - name: Run Flake8 + run: tox -e flake8; + + + black: + + name: Black (linter) + + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + + - name: Checkout source code + uses: actions/checkout@v2 + + - 
name: Install Python + uses: actions/setup-python@v1 + with: + python-version: "3.9" + + - name: Install Tox + run: pip install tox; + + - name: Run Black + run: tox -e black; + + + mypy: + name: Mypy (static type checker) + + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + + - name: Checkout source code + uses: actions/checkout@v2 + + - name: Install Python + uses: actions/setup-python@v1 + with: + python-version: "3.9" + + - name: Install Tox + run: pip install tox; + + - name: Run Mypy + run: tox -e mypy; + + + docs: + + name: Build documentation + + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + + - name: Checkout source code + uses: actions/checkout@v2 + + - name: Install Python + uses: actions/setup-python@v1 + with: + python-version: "3.9" + + - name: Install Tox + run: pip install tox; + + - name: Build documentation + run: tox -e docs; + + + packaging: + name: Packaging + + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + + - name: Checkout source code + uses: actions/checkout@v2 + + - name: Install Python + uses: actions/setup-python@v1 + with: + python-version: "3.9" + + - name: Install Tox + run: pip install tox; + + - name: Check packaging + run: tox -e packaging; + + + unit: + name: Unit Tests using Python ${{ matrix.python }} on Ubuntu + + needs: [flake8, black, mypy, docs, packaging] + + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python: ["2.7", "3.5", "3.6", "3.7", "3.8", "3.9", "pypy2", "pypy3"] + + steps: + + - name: Checkout source code + uses: actions/checkout@v2 + + - name: Install Python + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python }} + + - name: Install Tox + run: pip install tox; + + - name: Run unit tests + shell: bash + # This hairy shell code is here to map the Python versions + # specified above to the equivalents used in Tox environments. 
+ run: | + set -eux + py="${{ matrix.python }}"; + if [[ $py =~ pypy ]]; then # PyPy + py_test="${py}"; + else # CPython + py_test="py${py/./}"; # Add "py" prefix, remove "." + fi; + env_test="test-${py_test}-coverage_xml"; + echo "Test environment: ${env_test}"; + tox -e "${env_test}"; + tar cvzf pytest-logs.tgz ".tox/${env_test}/log"; + + - name: Upload pytest log artifact + if: failure() + uses: actions/upload-artifact@v1 + with: + name: pytest-logs + path: pytest-logs.tgz + + # Use the latest supported Python version for combining coverage to + # prevent parsing errors in older versions when looking at modern code. + - uses: "actions/setup-python@v2" + with: + python-version: "3.9" + + - name: "Upload coverage to Codecov" + uses: "codecov/codecov-action@v1" + with: + env_vars: GITHUB_REF,GITHUB_COMMIT,GITHUB_USER,GITHUB_WORKFLOW + fail_ci_if_error: true + env: + GITHUB_REF: ${{ github.ref }} + GITHUB_COMMIT: ${{ github.sha }} + GITHUB_USER: ${{ github.actor }} + GITHUB_WORKFLOW: ${{ github.workflow }} diff --git a/.gitignore b/.gitignore index 0ef6fd4d..35230642 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,5 @@ -docs/_build +/docs/_build/ tmp.py -htmlcov/ -.coverage.* *.py[cod] # emacs @@ -31,11 +29,23 @@ lib64 # Installer logs pip-log.txt -# Unit test / coverage reports -.coverage -.tox +# Testing +/.tox/ +/.hypothesis/ nosetests.xml +# Coverage +/.coverage +/.coverage.* +/htmlcov/ +/.mypy_cache/ + +# Documentation +/htmldocs/ + +# Documentation +/htmldocs/ + # Translations *.mo diff --git a/.tox-coveragerc b/.tox-coveragerc deleted file mode 100644 index 44178a43..00000000 --- a/.tox-coveragerc +++ /dev/null @@ -1,14 +0,0 @@ -[run] -branch = True -source = - hyperlink - ../hyperlink -omit = - */flycheck_* - -[paths] -source = - ../hyperlink - */lib/python*/site-packages/hyperlink - */Lib/site-packages/hyperlink - */pypy/site-packages/hyperlink diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 1aff2d28..00000000 --- a/.travis.yml +++ 
/dev/null @@ -1,39 +0,0 @@ -sudo: false -cache: -directories: -- $HOME/.cache/pip - -language: python - - -matrix: - include: - - python: "2.7" - env: TOXENV=py27 - - python: "3.4" - env: TOXENV=py34 - - python: "3.5" - env: TOXENV=py35 - - python: "3.6" - env: TOXENV=py36 - - python: "pypy" - env: TOXENV=pypy - - python: "2.7" - env: TOXENV=packaging - - -install: - - "pip install -r requirements-test.txt" - -script: - - tox - - -before_install: - - pip install codecov coverage - - -after_success: - - tox -e coverage-report - - COVERAGE_FILE=.tox/.coverage coverage xml - - codecov -f coverage.xml diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bc4f61e..50f34c6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,42 @@ # Hyperlink Changelog -## dev (not yet released) +## Next + +* CPython 3.9 added to test matrix + +## 21.0.0 + +*(January 7, 2021)* + +* Update plus sign (`+`) handling to work with/like HTML form encoding + (`POST`) by default, fixes [#129][i129], and associated roundtripping ([#146][i146]). +* Package IDNA tables. ([#134][i134]) +* Long overdue dependency bumps + +[i129]: https://github.com/python-hyper/hyperlink/issues/129 +[i134]: https://github.com/python-hyper/hyperlink/issues/134 +[i146]: https://github.com/python-hyper/hyperlink/issues/146 + +## 20.0.1 + +*(August 4, 2020)* + +Rerelease to fix packaging metadata around conditional requirements. +See [issue #133](https://github.com/python-hyper/hyperlink/issues/133) +for more details. + +## 20.0.0 + +*(August 3, 2020)* + +* CPython 3.7 and 3.8 and PyPy3 added to test matrix +* Hyperlink now has type hints and they are now exported per + [PEP 561](https://www.python.org/dev/peps/pep-0561/). +* Several bugs related to hidden state were fixed, making it so that all data + on a `URL` object (including `rooted` and `uses_netloc`) is reflected by and + consistent with its textual representation. 
+ This does mean that sometimes these constructor arguments are ignored, if it + would create invalid or unparseable URL text. ## 19.0.0 @@ -13,7 +49,8 @@ A query parameter-centric release, with two enhancements: [#39](https://github.com/python-hyper/hyperlink/pull/39)) * `URL.remove()` now accepts *value* and *limit* parameters, allowing for removal of specific name-value pairs, as well as limiting the - number of removals. (see [#71](https://github.com/python-hyper/hyperlink/pull/71)) + number of removals. + (See [#71](https://github.com/python-hyper/hyperlink/pull/71)) ## 18.0.0 diff --git a/LICENSE b/LICENSE index 30953dde..a73f882f 100644 --- a/LICENSE +++ b/LICENSE @@ -5,6 +5,7 @@ Jean Paul Calderone Adi Roiban Amber Hawkie Brown Mahmoud Hashemi +Wilfredo Sanchez Vega and others that have contributed code to the public domain. diff --git a/MANIFEST.in b/MANIFEST.in index c4b6e32f..5869a052 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,9 @@ -include README.md LICENSE CHANGELOG.md tox.ini requirements-test.txt .coveragerc Makefile pytest.ini .tox-coveragerc -exclude TODO.md appveyor.yml +include README.md LICENSE CHANGELOG.md +include tox.ini pytest.ini .coveragerc +exclude TODO.md +exclude .appveyor.yml + +include src/hyperlink/idna-tables-properties.csv.gz graft docs prune docs/_build diff --git a/README.md b/README.md index bca5d5f0..017f9eb8 100644 --- a/README.md +++ b/README.md @@ -2,15 +2,30 @@ *Cool URLs that don't change.* - - - + + Documentation + + + PyPI + + + Calendar Versioning + + + Python Version Compatibility + + + Code Coverage + + + Requirements Status + Hyperlink provides a pure-Python implementation of immutable URLs. Based on [RFC 3986][rfc3986] and [3987][rfc3987], the Hyperlink URL makes working with both URIs and IRIs easy. -Hyperlink is tested against Python 2.7, 3.4, 3.5, 3.6, and PyPy. +Hyperlink is tested against Python 2.7, 3.4, 3.5, 3.6, 3.7, 3.8, and PyPy. Full documentation is available on [Read the Docs][docs]. 
diff --git a/TODO.md b/TODO.md index e8ac57ab..f5d2fdda 100644 --- a/TODO.md +++ b/TODO.md @@ -29,7 +29,7 @@ * Speed up percent encoding with urlutils approach * More default ports * resolve dots on (empty) click -* better error on URL constructor (single string argument leads to succesful instantiation with invalid scheme) +* better error on URL constructor (single string argument leads to successful instantiation with invalid scheme) * pct encode userinfo * `__hash__` * README diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 5f146255..00000000 --- a/appveyor.yml +++ /dev/null @@ -1,50 +0,0 @@ -# What Python version is installed where: -# http://www.appveyor.com/docs/installed-software#python - -# This configuration based on: -# https://github.com/audreyr/cookiecutter/commit/3c4685f536afda3be93da3fe3039cec0ab0d60a3 - -environment: - matrix: - - PYTHON: "C:\\Python27-x64" - TOX_ENV: "py27" - - - PYTHON: "C:\\Python36-x64" - TOX_ENV: "py36" - - -init: - - set PATH=%PYTHON%;%PYTHON%\Scripts;%PATH% - - "git config --system http.sslcainfo \"C:\\Program Files\\Git\\mingw64\\ssl\\certs\\ca-bundle.crt\"" - - "%PYTHON%/python -V" - - "%PYTHON%/python -c \"import struct;print(8 * struct.calcsize(\'P\'))\"" - -install: - - "%PYTHON%/Scripts/easy_install -U pip" - - "%PYTHON%/Scripts/pip install -U --force-reinstall tox wheel" - - -build: false # Not a C# project, build stuff at the test step instead. 
- -test_script: - - "%PYTHON%/Scripts/tox -e %TOX_ENV%" - -after_test: - - "%PYTHON%/python setup.py bdist_wheel" - - ps: "ls dist" - -on_success: - # Report coverage results to codecov.io - # and export tox environment variables - - "%PYTHON%/Scripts/tox -e coverage-report" - - "%PYTHON%/Scripts/pip install codecov coverage" - - set COVERAGE_FILE=.tox/.coverage - - "%PYTHON%/Scripts/coverage xml" - - set OS=WINDOWS - - "%PYTHON%/Scripts/codecov -f coverage.xml -e TOX_ENV OS" - -artifacts: - - path: dist\* - -#on_success: -# - TODO: upload the content of dist/*.whl to a public wheelhouse diff --git a/docs/api.rst b/docs/api.rst index 47854540..93ebb782 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -5,11 +5,43 @@ Hyperlink API .. automodule:: hyperlink._url +.. contents:: + :local: + Creation -------- -Before you can work with URLs, you must create URLs. There are two -ways to create URLs, from parts and from text. +Before you can work with URLs, you must create URLs. + +Parsing Text +^^^^^^^^^^^^ + +If you already have a textual URL, the easiest way to get URL objects +is with the :func:`parse()` function: + +.. autofunction:: hyperlink.parse + +By default, :func:`~hyperlink.parse()` returns an instance of +:class:`DecodedURL`, a URL type that handles all encoding for you, by +wrapping the lower-level :class:`URL`. + +DecodedURL +^^^^^^^^^^ + +.. autoclass:: hyperlink.DecodedURL +.. automethod:: hyperlink.DecodedURL.from_text + +The Encoded URL +^^^^^^^^^^^^^^^ + +The lower-level :class:`URL` looks very similar to the +:class:`DecodedURL`, but does not handle all encoding cases for +you. Use with caution. + +.. note:: + + :class:`URL` is also available as an alias, + ``hyperlink.EncodedURL`` for more explicit usage. .. autoclass:: hyperlink.URL .. automethod:: hyperlink.URL.from_text @@ -61,7 +93,6 @@ URLs have many parts, and URL objects have many attributes to represent them. .. autoattribute:: hyperlink.URL.userinfo .. autoattribute:: hyperlink.URL.user .. 
autoattribute:: hyperlink.URL.rooted -.. autoattribute:: hyperlink.URL.family Low-level functions ------------------- @@ -70,6 +101,6 @@ A couple of notable helpers used by the :class:`~hyperlink.URL` type. .. autoclass:: hyperlink.URLParseError .. autofunction:: hyperlink.register_scheme -.. autofunction:: hyperlink.parse_host +.. autofunction:: hyperlink.parse .. TODO: run doctests in docs? diff --git a/docs/conf.py b/docs/conf.py index 0eb8cf66..f8a4fb98 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,11 +61,11 @@ # General information about the project. project = u'hyperlink' -copyright = u'2018, Mahmoud Hashemi' +copyright = u'2021, Mahmoud Hashemi' author = u'Mahmoud Hashemi' -version = '19.0' -release = '19.0.0' +version = '21.0' +release = '21.0.0' if os.name != 'nt': today_fmt = '%B %d, %Y' @@ -76,7 +76,7 @@ pygments_style = 'sphinx' # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'python': ('https://docs.python.org/2.7', None)} +intersphinx_mapping = {'python': ('https://docs.python.org/3.7', None)} # -- Options for HTML output ---------------------------------------------- diff --git a/docs/index.rst b/docs/index.rst index 2e65635d..cfc0c47d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,7 +10,7 @@ hyperlink URLs. Based on `RFC 3986`_ and `RFC 3987`_, the Hyperlink URL balances simplicity and correctness for both :ref:`URIs and IRIs `. -Hyperlink is tested against Python 2.7, 3.4, 3.5, 3.6, and PyPy. +Hyperlink is tested against Python 2.7, 3.4, 3.5, 3.6, 3.7, 3.8, and PyPy. For an introduction to the hyperlink library, its background, and URLs in general, see `this talk from PyConWeb 2017`_ (and `the accompanying @@ -39,9 +39,9 @@ library. 
The easiest way to install is with pip:: Then, URLs are just an import away:: - from hyperlink import URL + import hyperlink - url = URL.from_text(u'http://github.com/python-hyper/hyperlink?utm_source=readthedocs') + url = hyperlink.parse(u'http://github.com/python-hyper/hyperlink?utm_source=readthedocs') better_url = url.replace(scheme=u'https', port=443) org_url = better_url.click(u'.') @@ -49,7 +49,7 @@ Then, URLs are just an import away:: print(org_url.to_text()) # prints: https://github.com/python-hyper/ - print(better_url.get(u'utm_source')) + print(better_url.get(u'utm_source')[0]) # prints: readthedocs See :ref:`the API docs ` for more usage examples. diff --git a/hyperlink/__init__.py b/hyperlink/__init__.py deleted file mode 100644 index a027d52d..00000000 --- a/hyperlink/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ - -from ._url import (URL, - parse, - EncodedURL, - DecodedURL, - URLParseError, - register_scheme) - -__all__ = [ - "URL", - "parse", - "EncodedURL", - "DecodedURL", - "URLParseError", - "register_scheme", -] diff --git a/hyperlink/_url.py b/hyperlink/_url.py deleted file mode 100644 index 50e8535e..00000000 --- a/hyperlink/_url.py +++ /dev/null @@ -1,1921 +0,0 @@ -# -*- coding: utf-8 -*- -u"""Hyperlink provides Pythonic URL parsing, construction, and rendering. - -Usage is straightforward:: - - >>> from hyperlink import URL - >>> url = URL.from_text(u'http://github.com/mahmoud/hyperlink?utm_source=docs') - >>> url.host - u'github.com' - >>> secure_url = url.replace(scheme=u'https') - >>> secure_url.get('utm_source')[0] - u'docs' - -As seen here, the API revolves around the lightweight and immutable -:class:`URL` type, documented below. 
-""" - -import re -import sys -import string -import socket -from unicodedata import normalize -try: - from socket import inet_pton -except ImportError: - inet_pton = None # defined below -try: - from collections.abc import Mapping -except ImportError: # Python 2 - from collections import Mapping - -# Note: IDNAError is a subclass of UnicodeError -from idna import encode as idna_encode, decode as idna_decode, IDNAError - - -if inet_pton is None: - # based on https://gist.github.com/nnemkin/4966028 - # this code only applies on Windows Python 2.7 - import ctypes - - class _sockaddr(ctypes.Structure): - _fields_ = [("sa_family", ctypes.c_short), - ("__pad1", ctypes.c_ushort), - ("ipv4_addr", ctypes.c_byte * 4), - ("ipv6_addr", ctypes.c_byte * 16), - ("__pad2", ctypes.c_ulong)] - - WSAStringToAddressA = ctypes.windll.ws2_32.WSAStringToAddressA - WSAAddressToStringA = ctypes.windll.ws2_32.WSAAddressToStringA - - def inet_pton(address_family, ip_string): - addr = _sockaddr() - ip_string = ip_string.encode('ascii') - addr.sa_family = address_family - addr_size = ctypes.c_int(ctypes.sizeof(addr)) - - if WSAStringToAddressA(ip_string, address_family, None, ctypes.byref(addr), ctypes.byref(addr_size)) != 0: - raise socket.error(ctypes.FormatError()) - - if address_family == socket.AF_INET: - return ctypes.string_at(addr.ipv4_addr, 4) - if address_family == socket.AF_INET6: - return ctypes.string_at(addr.ipv6_addr, 16) - raise socket.error('unknown address family') - - -PY2 = (sys.version_info[0] == 2) -unicode = type(u'') -try: - unichr -except NameError: - unichr = chr # py3 -NoneType = type(None) - - -# from boltons.typeutils -def make_sentinel(name='_MISSING', var_name=None): - """Creates and returns a new **instance** of a new class, suitable for - usage as a "sentinel", a kind of singleton often used to indicate - a value is missing when ``None`` is a valid input. 
- - Args: - name (str): Name of the Sentinel - var_name (str): Set this name to the name of the variable in - its respective module enable pickleability. - - >>> make_sentinel(var_name='_MISSING') - _MISSING - - The most common use cases here in boltons are as default values - for optional function arguments, partly because of its - less-confusing appearance in automatically generated - documentation. Sentinels also function well as placeholders in queues - and linked lists. - - .. note:: - - By design, additional calls to ``make_sentinel`` with the same - values will not produce equivalent objects. - - >>> make_sentinel('TEST') == make_sentinel('TEST') - False - >>> type(make_sentinel('TEST')) == type(make_sentinel('TEST')) - False - - """ - class Sentinel(object): - def __init__(self): - self.name = name - self.var_name = var_name - - def __repr__(self): - if self.var_name: - return self.var_name - return '%s(%r)' % (self.__class__.__name__, self.name) - if var_name: - def __reduce__(self): - return self.var_name - - def __nonzero__(self): - return False - - __bool__ = __nonzero__ - - return Sentinel() - - -_unspecified = _UNSET = make_sentinel('_UNSET') - - -# RFC 3986 Section 2.3, Unreserved URI Characters -# https://tools.ietf.org/html/rfc3986#section-2.3 -_UNRESERVED_CHARS = frozenset('~-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' - 'abcdefghijklmnopqrstuvwxyz') - - -# URL parsing regex (based on RFC 3986 Appendix B, with modifications) -_URL_RE = re.compile(r'^((?P[^:/?#]+):)?' - r'((?P<_netloc_sep>//)' - r'(?P[^/?#]*))?' - r'(?P[^?#]*)' - r'(\?(?P[^#]*))?' - r'(#(?P.*))?$') -_SCHEME_RE = re.compile(r'^[a-zA-Z0-9+-.]*$') -_AUTHORITY_RE = re.compile(r'^(?:(?P[^@/?#]*)@)?' - r'(?P' - r'(?:\[(?P[^[\]/?#]*)\])' - r'|(?P[^:/?#[\]]*)' - r'|(?P.*?))?' 
- r'(?::(?P.*))?$') - - -_HEX_CHAR_MAP = dict([((a + b).encode('ascii'), - unichr(int(a + b, 16)).encode('charmap')) - for a in string.hexdigits for b in string.hexdigits]) -_ASCII_RE = re.compile('([\x00-\x7f]+)') - -# RFC 3986 section 2.2, Reserved Characters -# https://tools.ietf.org/html/rfc3986#section-2.2 -_GEN_DELIMS = frozenset(u':/?#[]@') -_SUB_DELIMS = frozenset(u"!$&'()*+,;=") -_ALL_DELIMS = _GEN_DELIMS | _SUB_DELIMS - -_USERINFO_SAFE = _UNRESERVED_CHARS | _SUB_DELIMS | set(u'%') -_USERINFO_DELIMS = _ALL_DELIMS - _USERINFO_SAFE -_PATH_SAFE = _USERINFO_SAFE | set(u':@') -_PATH_DELIMS = _ALL_DELIMS - _PATH_SAFE -_SCHEMELESS_PATH_SAFE = _PATH_SAFE - set(':') -_SCHEMELESS_PATH_DELIMS = _ALL_DELIMS - _SCHEMELESS_PATH_SAFE -_FRAGMENT_SAFE = _UNRESERVED_CHARS | _PATH_SAFE | set(u'/?') -_FRAGMENT_DELIMS = _ALL_DELIMS - _FRAGMENT_SAFE -_QUERY_VALUE_SAFE = _UNRESERVED_CHARS | _FRAGMENT_SAFE - set(u'&+') -_QUERY_VALUE_DELIMS = _ALL_DELIMS - _QUERY_VALUE_SAFE -_QUERY_KEY_SAFE = _UNRESERVED_CHARS | _QUERY_VALUE_SAFE - set(u'=') -_QUERY_KEY_DELIMS = _ALL_DELIMS - _QUERY_KEY_SAFE - - -def _make_decode_map(delims, allow_percent=False): - ret = dict(_HEX_CHAR_MAP) - if not allow_percent: - delims = set(delims) | set([u'%']) - for delim in delims: - _hexord = '{0:02X}'.format(ord(delim)).encode('ascii') - _hexord_lower = _hexord.lower() - ret.pop(_hexord) - if _hexord != _hexord_lower: - ret.pop(_hexord_lower) - return ret - - -def _make_quote_map(safe_chars): - ret = {} - # v is included in the dict for py3 mostly, because bytestrings - # are iterables of ints, of course! 
- for i, v in zip(range(256), range(256)): - c = chr(v) - if c in safe_chars: - ret[c] = ret[v] = c - else: - ret[c] = ret[v] = '%{0:02X}'.format(i) - return ret - - -_USERINFO_PART_QUOTE_MAP = _make_quote_map(_USERINFO_SAFE) -_USERINFO_DECODE_MAP = _make_decode_map(_USERINFO_DELIMS) -_PATH_PART_QUOTE_MAP = _make_quote_map(_PATH_SAFE) -_SCHEMELESS_PATH_PART_QUOTE_MAP = _make_quote_map(_SCHEMELESS_PATH_SAFE) -_PATH_DECODE_MAP = _make_decode_map(_PATH_DELIMS) -_QUERY_KEY_QUOTE_MAP = _make_quote_map(_QUERY_KEY_SAFE) -_QUERY_KEY_DECODE_MAP = _make_decode_map(_QUERY_KEY_DELIMS) -_QUERY_VALUE_QUOTE_MAP = _make_quote_map(_QUERY_VALUE_SAFE) -_QUERY_VALUE_DECODE_MAP = _make_decode_map(_QUERY_VALUE_DELIMS) -_FRAGMENT_QUOTE_MAP = _make_quote_map(_FRAGMENT_SAFE) -_FRAGMENT_DECODE_MAP = _make_decode_map(_FRAGMENT_DELIMS) -_UNRESERVED_QUOTE_MAP = _make_quote_map(_UNRESERVED_CHARS) -_UNRESERVED_DECODE_MAP = dict([(k, v) for k, v in _HEX_CHAR_MAP.items() - if v.decode('ascii', 'replace') - in _UNRESERVED_CHARS]) - -_ROOT_PATHS = frozenset(((), (u'',))) - - -def _encode_reserved(text, maximal=True): - """A very comprehensive percent encoding for encoding all - delimiters. Used for arguments to DecodedURL, where a % means a - percent sign, and not the character used by URLs for escaping - bytes. - """ - if maximal: - bytestr = normalize('NFC', text).encode('utf8') - return u''.join([_UNRESERVED_QUOTE_MAP[b] for b in bytestr]) - return u''.join([_UNRESERVED_QUOTE_MAP[t] if t in _UNRESERVED_CHARS - else t for t in text]) - - -def _encode_path_part(text, maximal=True): - "Percent-encode a single segment of a URL path." - if maximal: - bytestr = normalize('NFC', text).encode('utf8') - return u''.join([_PATH_PART_QUOTE_MAP[b] for b in bytestr]) - return u''.join([_PATH_PART_QUOTE_MAP[t] if t in _PATH_DELIMS else t - for t in text]) - - -def _encode_schemeless_path_part(text, maximal=True): - """Percent-encode the first segment of a URL path for a URL without a - scheme specified. 
- """ - if maximal: - bytestr = normalize('NFC', text).encode('utf8') - return u''.join([_SCHEMELESS_PATH_PART_QUOTE_MAP[b] for b in bytestr]) - return u''.join([_SCHEMELESS_PATH_PART_QUOTE_MAP[t] - if t in _SCHEMELESS_PATH_DELIMS else t for t in text]) - - -def _encode_path_parts(text_parts, rooted=False, has_scheme=True, - has_authority=True, joined=True, maximal=True): - """ - Percent-encode a tuple of path parts into a complete path. - - Setting *maximal* to False percent-encodes only the reserved - characters that are syntactically necessary for serialization, - preserving any IRI-style textual data. - - Leaving *maximal* set to its default True percent-encodes - everything required to convert a portion of an IRI to a portion of - a URI. - - RFC 3986 3.3: - - If a URI contains an authority component, then the path component - must either be empty or begin with a slash ("/") character. If a URI - does not contain an authority component, then the path cannot begin - with two slash characters ("//"). In addition, a URI reference - (Section 4.1) may be a relative-path reference, in which case the - first path segment cannot contain a colon (":") character. - """ - if not text_parts: - return u'' if joined else text_parts - if rooted: - text_parts = (u'',) + text_parts - # elif has_authority and text_parts: - # raise Exception('see rfc above') # TODO: too late to fail like this? - encoded_parts = [] - if has_scheme: - encoded_parts = [_encode_path_part(part, maximal=maximal) - if part else part for part in text_parts] - else: - encoded_parts = [_encode_schemeless_path_part(text_parts[0])] - encoded_parts.extend([_encode_path_part(part, maximal=maximal) - if part else part for part in text_parts[1:]]) - if joined: - return u'/'.join(encoded_parts) - return tuple(encoded_parts) - - -def _encode_query_key(text, maximal=True): - """ - Percent-encode a single query string key or value. 
- """ - if maximal: - bytestr = normalize('NFC', text).encode('utf8') - return u''.join([_QUERY_KEY_QUOTE_MAP[b] for b in bytestr]) - return u''.join([_QUERY_KEY_QUOTE_MAP[t] if t in _QUERY_KEY_DELIMS else t - for t in text]) - - -def _encode_query_value(text, maximal=True): - """ - Percent-encode a single query string key or value. - """ - if maximal: - bytestr = normalize('NFC', text).encode('utf8') - return u''.join([_QUERY_VALUE_QUOTE_MAP[b] for b in bytestr]) - return u''.join([_QUERY_VALUE_QUOTE_MAP[t] - if t in _QUERY_VALUE_DELIMS else t for t in text]) - - -def _encode_fragment_part(text, maximal=True): - """Quote the fragment part of the URL. Fragments don't have - subdelimiters, so the whole URL fragment can be passed. - """ - if maximal: - bytestr = normalize('NFC', text).encode('utf8') - return u''.join([_FRAGMENT_QUOTE_MAP[b] for b in bytestr]) - return u''.join([_FRAGMENT_QUOTE_MAP[t] if t in _FRAGMENT_DELIMS else t - for t in text]) - - -def _encode_userinfo_part(text, maximal=True): - """Quote special characters in either the username or password - section of the URL. 
- """ - if maximal: - bytestr = normalize('NFC', text).encode('utf8') - return u''.join([_USERINFO_PART_QUOTE_MAP[b] for b in bytestr]) - return u''.join([_USERINFO_PART_QUOTE_MAP[t] if t in _USERINFO_DELIMS - else t for t in text]) - - - -# This port list painstakingly curated by hand searching through -# https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml -# and -# https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml -SCHEME_PORT_MAP = {'acap': 674, 'afp': 548, 'dict': 2628, 'dns': 53, - 'file': None, 'ftp': 21, 'git': 9418, 'gopher': 70, - 'http': 80, 'https': 443, 'imap': 143, 'ipp': 631, - 'ipps': 631, 'irc': 194, 'ircs': 6697, 'ldap': 389, - 'ldaps': 636, 'mms': 1755, 'msrp': 2855, 'msrps': None, - 'mtqp': 1038, 'nfs': 111, 'nntp': 119, 'nntps': 563, - 'pop': 110, 'prospero': 1525, 'redis': 6379, 'rsync': 873, - 'rtsp': 554, 'rtsps': 322, 'rtspu': 5005, 'sftp': 22, - 'smb': 445, 'snmp': 161, 'ssh': 22, 'steam': None, - 'svn': 3690, 'telnet': 23, 'ventrilo': 3784, 'vnc': 5900, - 'wais': 210, 'ws': 80, 'wss': 443, 'xmpp': None} - -# This list of schemes that don't use authorities is also from the link above. -NO_NETLOC_SCHEMES = set(['urn', 'about', 'bitcoin', 'blob', 'data', 'geo', - 'magnet', 'mailto', 'news', 'pkcs11', - 'sip', 'sips', 'tel']) -# As of Mar 11, 2017, there were 44 netloc schemes, and 13 non-netloc - - -def register_scheme(text, uses_netloc=True, default_port=None): - """Registers new scheme information, resulting in correct port and - slash behavior from the URL object. There are dozens of standard - schemes preregistered, so this function is mostly meant for - proprietary internal customizations or stopgaps on missing - standards information. If a scheme seems to be missing, please - `file an issue`_! - - Args: - text (unicode): Text representing the scheme. - (the 'http' in 'http://hatnote.com') - uses_netloc (bool): Does the scheme support specifying a - network host? 
For instance, "http" does, "mailto" does - not. Defaults to True. - default_port (int): The default port, if any, for netloc-using - schemes. - - .. _file an issue: https://github.com/mahmoud/hyperlink/issues - - """ - text = text.lower() - if default_port is not None: - try: - default_port = int(default_port) - except (ValueError, TypeError): - raise ValueError('default_port expected integer or None, not %r' - % (default_port,)) - - if uses_netloc is True: - SCHEME_PORT_MAP[text] = default_port - elif uses_netloc is False: - if default_port is not None: - raise ValueError('unexpected default port while specifying' - ' non-netloc scheme: %r' % default_port) - NO_NETLOC_SCHEMES.add(text) - else: - raise ValueError('uses_netloc expected bool, not: %r' % uses_netloc) - - return - - -def scheme_uses_netloc(scheme, default=None): - """Whether or not a URL uses :code:`:` or :code:`://` to separate the - scheme from the rest of the URL depends on the scheme's own - standard definition. There is no way to infer this behavior - from other parts of the URL. A scheme either supports network - locations or it does not. - - The URL type's approach to this is to check for explicitly - registered schemes, with common schemes like HTTP - preregistered. This is the same approach taken by - :mod:`urlparse`. - - URL adds two additional heuristics if the scheme as a whole is - not registered. First, it attempts to check the subpart of the - scheme after the last ``+`` character. This adds intuitive - behavior for schemes like ``git+ssh``. Second, if a URL with - an unrecognized scheme is loaded, it will maintain the - separator it sees. 
- """ - if not scheme: - return False - scheme = scheme.lower() - if scheme in SCHEME_PORT_MAP: - return True - if scheme in NO_NETLOC_SCHEMES: - return False - if scheme.split('+')[-1] in SCHEME_PORT_MAP: - return True - return default - - -class URLParseError(ValueError): - """Exception inheriting from :exc:`ValueError`, raised when failing to - parse a URL. Mostly raised on invalid ports and IPv6 addresses. - """ - pass - - -def _optional(argument, default): - if argument is _UNSET: - return default - else: - return argument - - -def _typecheck(name, value, *types): - """ - Check that the given *value* is one of the given *types*, or raise an - exception describing the problem using *name*. - """ - if not types: - raise ValueError('expected one or more types, maybe use _textcheck?') - if not isinstance(value, types): - raise TypeError("expected %s for %s, got %r" - % (" or ".join([t.__name__ for t in types]), - name, value)) - return value - - -def _textcheck(name, value, delims=frozenset(), nullable=False): - if not isinstance(value, unicode): - if nullable and value is None: - return value # used by query string values - else: - str_name = "unicode" if PY2 else "str" - exp = str_name + ' or NoneType' if nullable else str_name - raise TypeError('expected %s for %s, got %r' % (exp, name, value)) - if delims and set(value) & set(delims): # TODO: test caching into regexes - raise ValueError('one or more reserved delimiters %s present in %s: %r' - % (''.join(delims), name, value)) - return value - - -def iter_pairs(iterable): - """ - Iterate over the (key, value) pairs in ``iterable``. - - This handles dictionaries sensibly, and falls back to assuming the - iterable yields (key, value) pairs. This behaviour is similar to - what Python's ``dict()`` constructor does. 
- """ - if isinstance(iterable, Mapping): - iterable = iterable.items() - return iter(iterable) - - -def _decode_unreserved(text, normalize_case=False, encode_stray_percents=False): - return _percent_decode(text, normalize_case=normalize_case, - encode_stray_percents=encode_stray_percents, - _decode_map=_UNRESERVED_DECODE_MAP) - - -def _decode_userinfo_part(text, normalize_case=False, encode_stray_percents=False): - return _percent_decode(text, normalize_case=normalize_case, - encode_stray_percents=encode_stray_percents, - _decode_map=_USERINFO_DECODE_MAP) - - -def _decode_path_part(text, normalize_case=False, encode_stray_percents=False): - """ - >>> _decode_path_part(u'%61%77%2f%7a') - u'aw%2fz' - >>> _decode_path_part(u'%61%77%2f%7a', normalize_case=True) - u'aw%2Fz' - """ - return _percent_decode(text, normalize_case=normalize_case, - encode_stray_percents=encode_stray_percents, - _decode_map=_PATH_DECODE_MAP) - - -def _decode_query_key(text, normalize_case=False, encode_stray_percents=False): - return _percent_decode(text, normalize_case=normalize_case, - encode_stray_percents=encode_stray_percents, - _decode_map=_QUERY_KEY_DECODE_MAP) - - -def _decode_query_value(text, normalize_case=False, encode_stray_percents=False): - return _percent_decode(text, normalize_case=normalize_case, - encode_stray_percents=encode_stray_percents, - _decode_map=_QUERY_VALUE_DECODE_MAP) - - -def _decode_fragment_part(text, normalize_case=False, encode_stray_percents=False): - return _percent_decode(text, normalize_case=normalize_case, - encode_stray_percents=encode_stray_percents, - _decode_map=_FRAGMENT_DECODE_MAP) - - -def _percent_decode(text, normalize_case=False, subencoding='utf-8', - raise_subencoding_exc=False, encode_stray_percents=False, - _decode_map=_HEX_CHAR_MAP): - """Convert percent-encoded text characters to their normal, - human-readable equivalents. - - All characters in the input text must be encodable by - *subencoding*. 
All special characters underlying the values in the - percent-encoding must be decodable as *subencoding*. If a - non-*subencoding*-valid string is passed, the original text is - returned with no changes applied. - - Only called by field-tailored variants, e.g., - :func:`_decode_path_part`, as every percent-encodable part of the - URL has characters which should not be percent decoded. - - >>> _percent_decode(u'abc%20def') - u'abc def' - - Args: - text (unicode): Text with percent-encoding present. - normalize_case (bool): Whether undecoded percent segments, such - as encoded delimiters, should be uppercased, per RFC 3986 - Section 2.1. See :func:`_decode_path_part` for an example. - subencoding (unicode): The name of the encoding underlying the - percent-encoding. Pass `False` to get back raw bytes. - raise_subencoding_exc (bool): Whether an error in decoding the bytes - underlying the percent-decoding should be raised. - - Returns: - unicode: The percent-decoded version of *text*, decoded by - *subencoding*, unless `subencoding=False` which returns bytes. - - """ - try: - quoted_bytes = text.encode('utf-8' if subencoding is False else subencoding) - except UnicodeEncodeError: - return text - - bits = quoted_bytes.split(b'%') - if len(bits) == 1: - return text - - res = [bits[0]] - append = res.append - - for item in bits[1:]: - hexpair, rest = item[:2], item[2:] - try: - append(_decode_map[hexpair]) - append(rest) - except KeyError: - pair_is_hex = hexpair in _HEX_CHAR_MAP - if pair_is_hex or not encode_stray_percents: - append(b'%') - else: - # if it's undecodable, treat as a real percent sign, - # which is reserved (because it wasn't in the - # context-aware _decode_map passed in), and should - # stay in an encoded state. 
- append(b'%25') - if normalize_case and pair_is_hex: - append(hexpair.upper()) - append(rest) - else: - append(item) - - unquoted_bytes = b''.join(res) - - if subencoding is False: - return unquoted_bytes - try: - return unquoted_bytes.decode(subencoding) - except UnicodeDecodeError: - if raise_subencoding_exc: - raise - return text - - -def _decode_host(host): - """Decode a host from ASCII-encodable text to IDNA-decoded text. If - the host text is not ASCII, it is returned unchanged, as it is - presumed that it is already IDNA-decoded. - - Some technical details: _decode_host is built on top of the "idna" - package, which has some quirks: - - Capital letters are not valid IDNA2008. The idna package will - raise an exception like this on capital letters: - - > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed - - However, if a segment of a host (i.e., something in - url.host.split('.')) is already ASCII, idna doesn't perform its - usual checks. In fact, for capital letters it automatically - lowercases them. - - This check and some other functionality can be bypassed by passing - uts46=True to idna.encode/decode. This allows a more permissive and - convenient interface. So far it seems like the balanced approach. 
- - Example output (from idna==2.6): - - >> idna.encode(u'mahmöud.io') - 'xn--mahmud-zxa.io' - >> idna.encode(u'Mahmöud.io') - Traceback (most recent call last): - File "", line 1, in - File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode - result.append(alabel(label)) - File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel - check_label(label) - File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label - raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label))) - idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed - >> idna.encode(u'Mahmoud.io') - 'Mahmoud.io' - - # Similar behavior for decodes below - >> idna.decode(u'Mahmoud.io') - u'mahmoud.io - >> idna.decode(u'Méhmoud.io', uts46=True) - u'm\xe9hmoud.io' - """ - if not host: - return u'' - try: - host_bytes = host.encode("ascii") - except UnicodeEncodeError: - host_text = host - else: - try: - host_text = idna_decode(host_bytes, uts46=True) - except ValueError: - # only reached on "narrow" (UCS-2) Python builds <3.4, see #7 - # NOTE: not going to raise here, because there's no - # ambiguity in the IDNA, and the host is still - # technically usable - host_text = host - return host_text - - -def _resolve_dot_segments(path): - """Normalize the URL path by resolving segments of '.' and '..'. For - more details, see `RFC 3986 section 5.2.4, Remove Dot Segments`_. - - Args: - path (list): path segments in string form - - Returns: - list: a new list of path segments with the '.' and '..' elements - removed and resolved. - - .. 
_RFC 3986 section 5.2.4, Remove Dot Segments: https://tools.ietf.org/html/rfc3986#section-5.2.4 - """ - segs = [] - - for seg in path: - if seg == u'.': - pass - elif seg == u'..': - if segs: - segs.pop() - else: - segs.append(seg) - - if list(path[-1:]) in ([u'.'], [u'..']): - segs.append(u'') - - return segs - - -def parse_host(host): - """Parse the host into a tuple of ``(family, host)``, where family - is the appropriate :mod:`socket` module constant when the host is - an IP address. Family is ``None`` when the host is not an IP. - - Will raise :class:`URLParseError` on invalid IPv6 constants. - - Returns: - tuple: family (socket constant or None), host (string) - - >>> parse_host('googlewebsite.com') == (None, 'googlewebsite.com') - True - >>> parse_host('::1') == (socket.AF_INET6, '::1') - True - >>> parse_host('192.168.1.1') == (socket.AF_INET, '192.168.1.1') - True - """ - if not host: - return None, u'' - if u':' in host: - try: - inet_pton(socket.AF_INET6, host) - except socket.error as se: - raise URLParseError('invalid IPv6 host: %r (%r)' % (host, se)) - except UnicodeEncodeError: - pass # TODO: this can't be a real host right? - else: - family = socket.AF_INET6 - return family, host - try: - inet_pton(socket.AF_INET, host) - except (socket.error, UnicodeEncodeError): - family = None # not an IP - else: - family = socket.AF_INET - return family, host - - -class URL(object): - """From blogs to billboards, URLs are so common, that it's easy to - overlook their complexity and power. With hyperlink's - :class:`URL` type, working with URLs doesn't have to be hard. - - URLs are made of many parts. 
Most of these parts are officially - named in `RFC 3986`_ and this diagram may prove handy in identifying - them:: - - foo://user:pass@example.com:8042/over/there?name=ferret#nose - \_/ \_______/ \_________/ \__/\_________/ \_________/ \__/ - | | | | | | | - scheme userinfo host port path query fragment - - While :meth:`~URL.from_text` is used for parsing whole URLs, the - :class:`URL` constructor builds a URL from the individual - components, like so:: - - >>> from hyperlink import URL - >>> url = URL(scheme=u'https', host=u'example.com', path=[u'hello', u'world']) - >>> print(url.to_text()) - https://example.com/hello/world - - The constructor runs basic type checks. All strings are expected - to be decoded (:class:`unicode` in Python 2). All arguments are - optional, defaulting to appropriately empty values. A full list of - constructor arguments is below. - - Args: - scheme (unicode): The text name of the scheme. - host (unicode): The host portion of the network location - port (int): The port part of the network location. If - ``None`` or no port is passed, the port will default to - the default port of the scheme, if it is known. See the - ``SCHEME_PORT_MAP`` and :func:`register_default_port` - for more info. - path (tuple): A tuple of strings representing the - slash-separated parts of the path. - query (tuple): The query parameters, as a dictionary or - as an iterable of key-value pairs. - fragment (unicode): The fragment part of the URL. - rooted (bool): Whether or not the path begins with a slash. - userinfo (unicode): The username or colon-separated - username:password pair. - uses_netloc (bool): Indicates whether two slashes appear - between the scheme and the host (``http://eg.com`` vs - ``mailto:e@g.com``). Set automatically based on scheme. - - All of these parts are also exposed as read-only attributes of - URL instances, along with several useful methods. - - .. _RFC 3986: https://tools.ietf.org/html/rfc3986 - .. 
_RFC 3987: https://tools.ietf.org/html/rfc3987 - """ - - def __init__(self, scheme=None, host=None, path=(), query=(), fragment=u'', - port=None, rooted=None, userinfo=u'', uses_netloc=None): - if host is not None and scheme is None: - scheme = u'http' # TODO: why - if port is None: - port = SCHEME_PORT_MAP.get(scheme) - if host and query and not path: - # per RFC 3986 6.2.3, "a URI that uses the generic syntax - # for authority with an empty path should be normalized to - # a path of '/'." - path = (u'',) - - # Now that we're done detecting whether they were passed, we can set - # them to their defaults: - if scheme is None: - scheme = u'' - if host is None: - host = u'' - if rooted is None: - rooted = bool(host) - - # Set attributes. - self._scheme = _textcheck("scheme", scheme) - if self._scheme: - if not _SCHEME_RE.match(self._scheme): - raise ValueError('invalid scheme: %r. Only alphanumeric, "+",' - ' "-", and "." allowed. Did you meant to call' - ' %s.from_text()?' - % (self._scheme, self.__class__.__name__)) - - _, self._host = parse_host(_textcheck('host', host, '/?#@')) - if isinstance(path, unicode): - raise TypeError("expected iterable of text for path, not: %r" - % (path,)) - self._path = tuple((_textcheck("path segment", segment, '/?#') - for segment in path)) - self._query = tuple( - (_textcheck("query parameter name", k, '&=#'), - _textcheck("query parameter value", v, '&#', nullable=True)) - for k, v in iter_pairs(query)) - self._fragment = _textcheck("fragment", fragment) - self._port = _typecheck("port", port, int, NoneType) - self._rooted = _typecheck("rooted", rooted, bool) - self._userinfo = _textcheck("userinfo", userinfo, '/?#@') - - uses_netloc = scheme_uses_netloc(self._scheme, uses_netloc) - self._uses_netloc = _typecheck("uses_netloc", - uses_netloc, bool, NoneType) - - return - - def get_decoded_url(self, lazy=False): - try: - return self._decoded_url - except AttributeError: - self._decoded_url = DecodedURL(self, lazy=lazy) - return 
self._decoded_url - - @property - def scheme(self): - """The scheme is a string, and the first part of an absolute URL, the - part before the first colon, and the part which defines the - semantics of the rest of the URL. Examples include "http", - "https", "ssh", "file", "mailto", and many others. See - :func:`~hyperlink.register_scheme()` for more info. - """ - return self._scheme - - @property - def host(self): - """The host is a string, and the second standard part of an absolute - URL. When present, a valid host must be a domain name, or an - IP (v4 or v6). It occurs before the first slash, or the second - colon, if a :attr:`~hyperlink.URL.port` is provided. - """ - return self._host - - @property - def port(self): - """The port is an integer that is commonly used in connecting to the - :attr:`host`, and almost never appears without it. - - When not present in the original URL, this attribute defaults - to the scheme's default port. If the scheme's default port is - not known, and the port is not provided, this attribute will - be set to None. - - >>> URL.from_text(u'http://example.com/pa/th').port - 80 - >>> URL.from_text(u'foo://example.com/pa/th').port - >>> URL.from_text(u'foo://example.com:8042/pa/th').port - 8042 - - .. note:: - - Per the standard, when the port is the same as the schemes - default port, it will be omitted in the text URL. - - """ - return self._port - - @property - def path(self): - """A tuple of strings, created by splitting the slash-separated - hierarchical path. Started by the first slash after the host, - terminated by a "?", which indicates the start of the - :attr:`~hyperlink.URL.query` string. - """ - return self._path - - @property - def query(self): - """Tuple of pairs, created by splitting the ampersand-separated - mapping of keys and optional values representing - non-hierarchical data used to identify the resource. Keys are - always strings. Values are strings when present, or None when - missing. 
- - For more operations on the mapping, see - :meth:`~hyperlink.URL.get()`, :meth:`~hyperlink.URL.add()`, - :meth:`~hyperlink.URL.set()`, and - :meth:`~hyperlink.URL.delete()`. - """ - return self._query - - @property - def fragment(self): - """A string, the last part of the URL, indicated by the first "#" - after the :attr:`~hyperlink.URL.path` or - :attr:`~hyperlink.URL.query`. Enables indirect identification - of a secondary resource, like an anchor within an HTML page. - - """ - return self._fragment - - @property - def rooted(self): - """Whether or not the path starts with a forward slash (``/``). - - This is taken from the terminology in the BNF grammar, - specifically the "path-rootless", rule, since "absolute path" - and "absolute URI" are somewhat ambiguous. :attr:`path` does - not contain the implicit prefixed ``"/"`` since that is - somewhat awkward to work with. - - """ - return self._rooted - - @property - def userinfo(self): - """The colon-separated string forming the username-password - combination. - """ - return self._userinfo - - @property - def uses_netloc(self): - """ - """ - return self._uses_netloc - - @property - def user(self): - """ - The user portion of :attr:`~hyperlink.URL.userinfo`. - """ - return self.userinfo.split(u':')[0] - - def authority(self, with_password=False, **kw): - """Compute and return the appropriate host/port/userinfo combination. - - >>> url = URL.from_text(u'http://user:pass@localhost:8080/a/b?x=y') - >>> url.authority() - u'user:@localhost:8080' - >>> url.authority(with_password=True) - u'user:pass@localhost:8080' - - Args: - with_password (bool): Whether the return value of this - method include the password in the URL, if it is - set. Defaults to False. - - Returns: - str: The authority (network location and user information) portion - of the URL. 
- """ - # first, a bit of twisted compat - with_password = kw.pop('includeSecrets', with_password) - if kw: - raise TypeError('got unexpected keyword arguments: %r' % kw.keys()) - host = self.host - if ':' in host: - hostport = ['[' + host + ']'] - else: - hostport = [self.host] - if self.port != SCHEME_PORT_MAP.get(self.scheme): - hostport.append(unicode(self.port)) - authority = [] - if self.userinfo: - userinfo = self.userinfo - if not with_password and u":" in userinfo: - userinfo = userinfo[:userinfo.index(u":") + 1] - authority.append(userinfo) - authority.append(u":".join(hostport)) - return u"@".join(authority) - - def __eq__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - for attr in ['scheme', 'userinfo', 'host', 'query', - 'fragment', 'port', 'uses_netloc']: - if getattr(self, attr) != getattr(other, attr): - return False - if self.path == other.path or (self.path in _ROOT_PATHS - and other.path in _ROOT_PATHS): - return True - return False - - def __ne__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return not self.__eq__(other) - - def __hash__(self): - return hash((self.__class__, self.scheme, self.userinfo, self.host, - self.path, self.query, self.fragment, self.port, - self.rooted, self.uses_netloc)) - - @property - def absolute(self): - """Whether or not the URL is "absolute". Absolute URLs are complete - enough to resolve to a network resource without being relative - to a base URI. - - >>> URL.from_text(u'http://wikipedia.org/').absolute - True - >>> URL.from_text(u'?a=b&c=d').absolute - False - - Absolute URLs must have both a scheme and a host set. 
- """ - return bool(self.scheme and self.host) - - def replace(self, scheme=_UNSET, host=_UNSET, path=_UNSET, query=_UNSET, - fragment=_UNSET, port=_UNSET, rooted=_UNSET, userinfo=_UNSET, - uses_netloc=_UNSET): - """:class:`URL` objects are immutable, which means that attributes - are designed to be set only once, at construction. Instead of - modifying an existing URL, one simply creates a copy with the - desired changes. - - If any of the following arguments is omitted, it defaults to - the value on the current URL. - - Args: - scheme (unicode): The text name of the scheme. - host (unicode): The host portion of the network location - port (int): The port part of the network location. - path (tuple): A tuple of strings representing the - slash-separated parts of the path. - query (tuple): The query parameters, as a tuple of - key-value pairs. - query (tuple): The query parameters, as a dictionary or - as an iterable of key-value pairs. - fragment (unicode): The fragment part of the URL. - rooted (bool): Whether or not the path begins with a slash. - userinfo (unicode): The username or colon-separated - username:password pair. - uses_netloc (bool): Indicates whether two slashes appear - between the scheme and the host (``http://eg.com`` vs - ``mailto:e@g.com``) - - Returns: - URL: a copy of the current :class:`URL`, with new values for - parameters passed. 
- - """ - return self.__class__( - scheme=_optional(scheme, self.scheme), - host=_optional(host, self.host), - path=_optional(path, self.path), - query=_optional(query, self.query), - fragment=_optional(fragment, self.fragment), - port=_optional(port, self.port), - rooted=_optional(rooted, self.rooted), - userinfo=_optional(userinfo, self.userinfo), - uses_netloc=_optional(uses_netloc, self.uses_netloc) - ) - - @classmethod - def from_text(cls, text): - """Whereas the :class:`URL` constructor is useful for constructing - URLs from parts, :meth:`~URL.from_text` supports parsing whole - URLs from their string form:: - - >>> URL.from_text(u'http://example.com') - URL.from_text(u'http://example.com') - >>> URL.from_text(u'?a=b&x=y') - URL.from_text(u'?a=b&x=y') - - As you can see above, it's also used as the :func:`repr` of - :class:`URL` objects. The natural counterpart to - :func:`~URL.to_text()`. This method only accepts *text*, so be - sure to decode those bytestrings. - - Args: - text (unicode): A valid URL string. - - Returns: - URL: The structured object version of the parsed string. - - .. note:: - - Somewhat unexpectedly, URLs are a far more permissive - format than most would assume. Many strings which don't - look like URLs are still valid URLs. As a result, this - method only raises :class:`URLParseError` on invalid port - and IPv6 values in the host portion of the URL. 
- - """ - um = _URL_RE.match(_textcheck('text', text)) - try: - gs = um.groupdict() - except AttributeError: - raise URLParseError('could not parse url: %r' % text) - - au_text = gs['authority'] or u'' - au_m = _AUTHORITY_RE.match(au_text) - try: - au_gs = au_m.groupdict() - except AttributeError: - raise URLParseError('invalid authority %r in url: %r' - % (au_text, text)) - if au_gs['bad_host']: - raise URLParseError('invalid host %r in url: %r' - % (au_gs['bad_host'], text)) - - userinfo = au_gs['userinfo'] or u'' - - host = au_gs['ipv6_host'] or au_gs['plain_host'] - port = au_gs['port'] - if port is not None: - try: - port = int(port) - except ValueError: - if not port: # TODO: excessive? - raise URLParseError('port must not be empty: %r' % au_text) - raise URLParseError('expected integer for port, not %r' % port) - - scheme = gs['scheme'] or u'' - fragment = gs['fragment'] or u'' - uses_netloc = bool(gs['_netloc_sep']) - - if gs['path']: - path = gs['path'].split(u"/") - if not path[0]: - path.pop(0) - rooted = True - else: - rooted = False - else: - path = () - rooted = bool(au_text) - if gs['query']: - query = ((qe.split(u"=", 1) if u'=' in qe else (qe, None)) - for qe in gs['query'].split(u"&")) - else: - query = () - return cls(scheme, host, path, query, fragment, port, - rooted, userinfo, uses_netloc) - - def normalize(self, scheme=True, host=True, path=True, query=True, - fragment=True, userinfo=True, percents=True): - """Return a new URL object with several standard normalizations - applied: - - * Decode unreserved characters (`RFC 3986 2.3`_) - * Uppercase remaining percent-encoded octets (`RFC 3986 2.1`_) - * Convert scheme and host casing to lowercase (`RFC 3986 3.2.2`_) - * Resolve any "." and ".." 
references in the path (`RFC 3986 6.2.2.3`_) - * Ensure an ending slash on URLs with an empty path (`RFC 3986 6.2.3`_) - * Encode any stray percent signs (`%`) in percent-encoded - fields (path, query, fragment, userinfo) (`RFC 3986 2.4`_) - - All are applied by default, but normalizations can be disabled - per-part by passing `False` for that part's corresponding - name. - - Args: - scheme (bool): Convert the scheme to lowercase - host (bool): Convert the host to lowercase - path (bool): Normalize the path (see above for details) - query (bool): Normalize the query string - fragment (bool): Normalize the fragment - userinfo (bool): Normalize the userinfo - percents (bool): Encode isolated percent signs - for any percent-encoded fields which are being - normalized (defaults to True). - - >>> url = URL.from_text(u'Http://example.COM/a/../b/./c%2f?%61%') - >>> print(url.normalize().to_text()) - http://example.com/b/c%2F?a%25 - - .. _RFC 3986 3.2.2: https://tools.ietf.org/html/rfc3986#section-3.2.2 - .. _RFC 3986 2.3: https://tools.ietf.org/html/rfc3986#section-2.3 - .. _RFC 3986 2.1: https://tools.ietf.org/html/rfc3986#section-2.1 - .. _RFC 3986 6.2.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.2.3 - .. _RFC 3986 6.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.3 - .. 
_RFC 3986 2.4: https://tools.ietf.org/html/rfc3986#section-2.4 - - """ - kw = {} - if scheme: - kw['scheme'] = self.scheme.lower() - if host: - kw['host'] = self.host.lower() - def _dec_unres(target): - return _decode_unreserved(target, normalize_case=True, - encode_stray_percents=percents) - if path: - if self.path: - kw['path'] = [_dec_unres(p) for p in _resolve_dot_segments(self.path)] - else: - kw['path'] = (u'',) - if query: - kw['query'] = [(_dec_unres(k), _dec_unres(v) if v else v) - for k, v in self.query] - if fragment: - kw['fragment'] = _dec_unres(self.fragment) - if userinfo: - kw['userinfo'] = u':'.join([_dec_unres(p) - for p in self.userinfo.split(':', 1)]) - - return self.replace(**kw) - - def child(self, *segments): - """Make a new :class:`URL` where the given path segments are a child - of this URL, preserving other parts of the URL, including the - query string and fragment. - - For example:: - - >>> url = URL.from_text(u'http://localhost/a/b?x=y') - >>> child_url = url.child(u"c", u"d") - >>> child_url.to_text() - u'http://localhost/a/b/c/d?x=y' - - Args: - segments (unicode): Additional parts to be joined and added to - the path, like :func:`os.path.join`. Special characters - in segments will be percent encoded. - - Returns: - URL: A copy of the current URL with the extra path segments. - - """ - if not segments: - return self - - segments = [_textcheck('path segment', s) for s in segments] - new_segs = _encode_path_parts(segments, joined=False, maximal=False) - new_path = self.path[:-1 if (self.path and self.path[-1] == u'') - else None] + new_segs - return self.replace(path=new_path) - - def sibling(self, segment): - """Make a new :class:`URL` with a single path segment that is a - sibling of this URL path. - - Args: - segment (unicode): A single path segment. - - Returns: - URL: A copy of the current URL with the last path segment - replaced by *segment*. Special characters such as - ``/?#`` will be percent encoded. 
- - """ - _textcheck('path segment', segment) - new_path = self.path[:-1] + (_encode_path_part(segment),) - return self.replace(path=new_path) - - def click(self, href=u''): - """Resolve the given URL relative to this URL. - - The resulting URI should match what a web browser would - generate if you visited the current URL and clicked on *href*. - - >>> url = URL.from_text(u'http://blog.hatnote.com/') - >>> url.click(u'/post/155074058790').to_text() - u'http://blog.hatnote.com/post/155074058790' - >>> url = URL.from_text(u'http://localhost/a/b/c/') - >>> url.click(u'../d/./e').to_text() - u'http://localhost/a/b/d/e' - - Args: - href (unicode): A string representing a clicked URL. - - Return: - URL: A copy of the current URL with navigation logic applied. - - For more information, see `RFC 3986 section 5`_. - - .. _RFC 3986 section 5: https://tools.ietf.org/html/rfc3986#section-5 - """ - if href: - if isinstance(href, URL): - clicked = href - else: - # TODO: This error message is not completely accurate, - # as URL objects are now also valid, but Twisted's - # test suite (wrongly) relies on this exact message. - _textcheck('relative URL', href) - clicked = URL.from_text(href) - if clicked.absolute: - return clicked - else: - clicked = self - - query = clicked.query - if clicked.scheme and not clicked.rooted: - # Schemes with relative paths are not well-defined. RFC 3986 calls - # them a "loophole in prior specifications" that should be avoided, - # or supported only for backwards compatibility. 
- raise NotImplementedError('absolute URI with rootless path: %r' - % (href,)) - else: - if clicked.rooted: - path = clicked.path - elif clicked.path: - path = self.path[:-1] + clicked.path - else: - path = self.path - if not query: - query = self.query - return self.replace(scheme=clicked.scheme or self.scheme, - host=clicked.host or self.host, - port=clicked.port or self.port, - path=_resolve_dot_segments(path), - query=query, - fragment=clicked.fragment) - - def to_uri(self): - u"""Make a new :class:`URL` instance with all non-ASCII characters - appropriately percent-encoded. This is useful to do in preparation - for sending a :class:`URL` over a network protocol. - - For example:: - - >>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri() - URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/') - - Returns: - URL: A new instance with its path segments, query parameters, and - hostname encoded, so that they are all in the standard - US-ASCII range. - """ - new_userinfo = u':'.join([_encode_userinfo_part(p) for p in - self.userinfo.split(':', 1)]) - new_path = _encode_path_parts(self.path, has_scheme=bool(self.scheme), - rooted=False, joined=False, maximal=True) - new_host = self.host if not self.host else idna_encode(self.host, uts46=True).decode("ascii") - return self.replace( - userinfo=new_userinfo, - host=new_host, - path=new_path, - query=tuple([(_encode_query_key(k, maximal=True), - _encode_query_value(v, maximal=True) - if v is not None else None) - for k, v in self.query]), - fragment=_encode_fragment_part(self.fragment, maximal=True) - ) - - def to_iri(self): - u"""Make a new :class:`URL` instance with all but a few reserved - characters decoded into human-readable format. - - Percent-encoded Unicode and IDNA-encoded hostnames are - decoded, like so:: - - >>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/') - >>> print(url.to_iri().to_text()) - https://ايران.example.com/foo⇧bar/ - - .. 
note:: - - As a general Python issue, "narrow" (UCS-2) builds of - Python may not be able to fully decode certain URLs, and - the in those cases, this method will return a best-effort, - partially-decoded, URL which is still valid. This issue - does not affect any Python builds 3.4+. - - Returns: - URL: A new instance with its path segments, query parameters, and - hostname decoded for display purposes. - """ - new_userinfo = u':'.join([_decode_userinfo_part(p) for p in - self.userinfo.split(':', 1)]) - host_text = _decode_host(self.host) - - return self.replace(userinfo=new_userinfo, - host=host_text, - path=[_decode_path_part(segment) - for segment in self.path], - query=[(_decode_query_key(k), - _decode_query_value(v) - if v is not None else None) - for k, v in self.query], - fragment=_decode_fragment_part(self.fragment)) - - def to_text(self, with_password=False): - """Render this URL to its textual representation. - - By default, the URL text will *not* include a password, if one - is set. RFC 3986 considers using URLs to represent such - sensitive information as deprecated. Quoting from RFC 3986, - `section 3.2.1`: - - "Applications should not render as clear text any data after the - first colon (":") character found within a userinfo subcomponent - unless the data after the colon is the empty string (indicating no - password)." - - Args: - with_password (bool): Whether or not to include the - password in the URL text. Defaults to False. - - Returns: - str: The serialized textual representation of this URL, - such as ``u"http://example.com/some/path?some=query"``. - - The natural counterpart to :class:`URL.from_text()`. - - .. 
_section 3.2.1: https://tools.ietf.org/html/rfc3986#section-3.2.1 - """ - scheme = self.scheme - authority = self.authority(with_password) - path = _encode_path_parts(self.path, - rooted=self.rooted, - has_scheme=bool(scheme), - has_authority=bool(authority), - maximal=False) - query_parts = [] - for k, v in self.query: - if v is None: - query_parts.append(_encode_query_key(k, maximal=False)) - else: - query_parts.append(u'='.join((_encode_query_key(k, maximal=False), - _encode_query_value(v, maximal=False)))) - query_string = u'&'.join(query_parts) - - fragment = self.fragment - - parts = [] - _add = parts.append - if scheme: - _add(scheme) - _add(':') - if authority: - _add('//') - _add(authority) - elif (scheme and path[:2] != '//' and self.uses_netloc): - _add('//') - if path: - if scheme and authority and path[:1] != '/': - _add('/') # relpaths with abs authorities auto get '/' - _add(path) - if query_string: - _add('?') - _add(query_string) - if fragment: - _add('#') - _add(fragment) - return u''.join(parts) - - def __repr__(self): - """Convert this URL to an representation that shows all of its - constituent parts, as well as being a valid argument to - :func:`eval`. - """ - return '%s.from_text(%r)' % (self.__class__.__name__, self.to_text()) - - def _to_bytes(self): - """ - Allows for direct usage of URL objects with libraries like - requests, which automatically stringify URL parameters. See - issue #49. 
- """ - return self.to_uri().to_text().encode('ascii') - - if PY2: - __str__ = _to_bytes - __unicode__ = to_text - else: - __bytes__ = _to_bytes - __str__ = to_text - - # # Begin Twisted Compat Code - asURI = to_uri - asIRI = to_iri - - @classmethod - def fromText(cls, s): - return cls.from_text(s) - - def asText(self, includeSecrets=False): - return self.to_text(with_password=includeSecrets) - - def __dir__(self): - try: - ret = object.__dir__(self) - except AttributeError: - # object.__dir__ == AttributeError # pdw for py2 - ret = dir(self.__class__) + list(self.__dict__.keys()) - ret = sorted(set(ret) - set(['fromText', 'asURI', 'asIRI', 'asText'])) - return ret - - # # End Twisted Compat Code - - def add(self, name, value=None): - """Make a new :class:`URL` instance with a given query argument, - *name*, added to it with the value *value*, like so:: - - >>> URL.from_text(u'https://example.com/?x=y').add(u'x') - URL.from_text(u'https://example.com/?x=y&x') - >>> URL.from_text(u'https://example.com/?x=y').add(u'x', u'z') - URL.from_text(u'https://example.com/?x=y&x=z') - - Args: - name (unicode): The name of the query parameter to add. The - part before the ``=``. - value (unicode): The value of the query parameter to add. The - part after the ``=``. Defaults to ``None``, meaning no - value. - - Returns: - URL: A new :class:`URL` instance with the parameter added. - """ - return self.replace(query=self.query + ((name, value),)) - - def set(self, name, value=None): - """Make a new :class:`URL` instance with the query parameter *name* - set to *value*. All existing occurences, if any are replaced - by the single name-value pair. - - >>> URL.from_text(u'https://example.com/?x=y').set(u'x') - URL.from_text(u'https://example.com/?x') - >>> URL.from_text(u'https://example.com/?x=y').set(u'x', u'z') - URL.from_text(u'https://example.com/?x=z') - - Args: - name (unicode): The name of the query parameter to set. The - part before the ``=``. 
- value (unicode): The value of the query parameter to set. The - part after the ``=``. Defaults to ``None``, meaning no - value. - - Returns: - URL: A new :class:`URL` instance with the parameter set. - """ - # Preserve the original position of the query key in the list - q = [(k, v) for (k, v) in self.query if k != name] - idx = next((i for (i, (k, v)) in enumerate(self.query) - if k == name), -1) - q[idx:idx] = [(name, value)] - return self.replace(query=q) - - def get(self, name): - """Get a list of values for the given query parameter, *name*:: - - >>> url = URL.from_text(u'?x=1&x=2') - >>> url.get('x') - [u'1', u'2'] - >>> url.get('y') - [] - - If the given *name* is not set, an empty list is returned. A - list is always returned, and this method raises no exceptions. - - Args: - name (unicode): The name of the query parameter to get. - - Returns: - list: A list of all the values associated with the key, in - string form. - - """ - return [value for (key, value) in self.query if name == key] - - def remove(self, name, value=_UNSET, limit=None): - """Make a new :class:`URL` instance with occurrences of the query - parameter *name* removed, or, if *value* is set, parameters - matching *name* and *value*. No exception is raised if the - parameter is not already set. - - Args: - name (unicode): The name of the query parameter to remove. - value (unicode): Optional value to additionally filter - on. Setting this removes query parameters which match - both name and value. - limit (int): Optional maximum number of parameters to remove. - - Returns: - URL: A new :class:`URL` instance with the parameter removed. 
- """ - if limit is None: - if value is _UNSET: - nq = [(k, v) for (k, v) in self.query if k != name] - else: - nq = [(k, v) for (k, v) in self.query if not (k == name and v == value)] - else: - nq, removed_count = [], 0 - - for k, v in self.query: - if k == name and (value is _UNSET or v == value) and removed_count < limit: - removed_count += 1 # drop it - else: - nq.append((k, v)) # keep it - - return self.replace(query=nq) - - -EncodedURL = URL # An alias better describing what the URL really is - - -class DecodedURL(object): - """DecodedURL is a type meant to act as a higher-level interface to - the URL. It is the `unicode` to URL's `bytes`. `DecodedURL` has - almost exactly the same API as `URL`, but everything going in and - out is in its maximally decoded state. All percent decoding is - handled automatically. - - Where applicable, a UTF-8 encoding is presumed. Be advised that - some interactions can raise :exc:`UnicodeEncodeErrors` and - :exc:`UnicodeDecodeErrors`, just like when working with - bytestrings. Examples of such interactions include handling query - strings encoding binary data, and paths containing segments with - special characters encoded with codecs other than UTF-8. - - Args: - url (URL): A :class:`URL` object to wrap. - lazy (bool): Set to True to avoid pre-decode all parts of the - URL to check for validity. Defaults to False. - - """ - def __init__(self, url, lazy=False): - self._url = url - if not lazy: - # cache the following, while triggering any decoding - # issues with decodable fields - self.host, self.userinfo, self.path, self.query, self.fragment - return - - @classmethod - def from_text(cls, text, lazy=False): - """\ - Make a `DecodedURL` instance from any text string containing a URL. - - Args: - text (unicode): Text containing the URL - lazy (bool): Whether to pre-decode all parts of the URL to - check for validity. Defaults to True. 
- """ - _url = URL.from_text(text) - return cls(_url, lazy=lazy) - - @property - def encoded_url(self): - """Access the underlying :class:`URL` object, which has any special - characters encoded. - """ - return self._url - - def to_text(self, *a, **kw): - "Passthrough to :meth:`~hyperlink.URL.to_text()`" - return self._url.to_text(*a, **kw) - - def to_uri(self, *a, **kw): - "Passthrough to :meth:`~hyperlink.URL.to_uri()`" - return self._url.to_uri(*a, **kw) - - def to_iri(self, *a, **kw): - "Passthrough to :meth:`~hyperlink.URL.to_iri()`" - return self._url.to_iri(*a, **kw) - - def click(self, href=u''): - "Return a new DecodedURL wrapping the result of :meth:`~hyperlink.URL.click()`" - if isinstance(href, DecodedURL): - href = href._url - return self.__class__(self._url.click(href=href)) - - def sibling(self, segment): - """Automatically encode any reserved characters in *segment* and - return a new `DecodedURL` wrapping the result of - :meth:`~hyperlink.URL.sibling()` - """ - return self.__class__(self._url.sibling(_encode_reserved(segment))) - - def child(self, *segments): - """Automatically encode any reserved characters in *segments* and - return a new `DecodedURL` wrapping the result of - :meth:`~hyperlink.URL.child()`. 
- """ - if not segments: - return self - new_segs = [_encode_reserved(s) for s in segments] - return self.__class__(self._url.child(*new_segs)) - - def normalize(self, *a, **kw): - "Return a new `DecodedURL` wrapping the result of :meth:`~hyperlink.URL.normalize()`" - return self.__class__(self._url.normalize(*a, **kw)) - - @property - def absolute(self): - return self._url.absolute - - @property - def scheme(self): - return self._url.scheme - - @property - def host(self): - return _decode_host(self._url.host) - - @property - def port(self): - return self._url.port - - @property - def rooted(self): - return self._url.rooted - - @property - def path(self): - try: - return self._path - except AttributeError: - pass - self._path = tuple([_percent_decode(p, raise_subencoding_exc=True) - for p in self._url.path]) - return self._path - - @property - def query(self): - try: - return self._query - except AttributeError: - pass - _q = [tuple(_percent_decode(x, raise_subencoding_exc=True) - if x is not None else None - for x in (k, v)) - for k, v in self._url.query] - self._query = tuple(_q) - return self._query - - @property - def fragment(self): - try: - return self._fragment - except AttributeError: - pass - frag = self._url.fragment - self._fragment = _percent_decode(frag, raise_subencoding_exc=True) - return self._fragment - - @property - def userinfo(self): - try: - return self._userinfo - except AttributeError: - pass - self._userinfo = tuple([_percent_decode(p, raise_subencoding_exc=True) - for p in self._url.userinfo.split(':', 1)]) - return self._userinfo - - @property - def user(self): - return self.userinfo[0] - - @property - def uses_netloc(self): - return self._url.uses_netloc - - def replace(self, scheme=_UNSET, host=_UNSET, path=_UNSET, query=_UNSET, - fragment=_UNSET, port=_UNSET, rooted=_UNSET, userinfo=_UNSET, - uses_netloc=_UNSET): - """While the signature is the same, this `replace()` differs a little - from URL.replace. 
For instance, it accepts userinfo as a - tuple, not as a string, handling the case of having a username - containing a `:`. As with the rest of the methods on - DecodedURL, if you pass a reserved character, it will be - automatically encoded instead of an error being raised. - - """ - if path is not _UNSET: - path = [_encode_reserved(p) for p in path] - if query is not _UNSET: - query = [[_encode_reserved(x) - if x is not None else None - for x in (k, v)] - for k, v in iter_pairs(query)] - if userinfo is not _UNSET: - if len(userinfo) > 2: - raise ValueError('userinfo expected sequence of ["user"] or' - ' ["user", "password"], got %r' % userinfo) - userinfo = u':'.join([_encode_reserved(p) for p in userinfo]) - new_url = self._url.replace(scheme=scheme, - host=host, - path=path, - query=query, - fragment=fragment, - port=port, - rooted=rooted, - userinfo=userinfo, - uses_netloc=uses_netloc) - return self.__class__(url=new_url) - - def get(self, name): - "Get the value of all query parameters whose name matches *name*" - return [v for (k, v) in self.query if name == k] - - def add(self, name, value=None): - "Return a new DecodedURL with the query parameter *name* and *value* added." - return self.replace(query=self.query + ((name, value),)) - - def set(self, name, value=None): - "Return a new DecodedURL with query parameter *name* set to *value*" - query = self.query - q = [(k, v) for (k, v) in query if k != name] - idx = next((i for (i, (k, v)) in enumerate(query) if k == name), -1) - q[idx:idx] = [(name, value)] - return self.replace(query=q) - - def remove(self, name, value=_UNSET, limit=None): - """Return a new DecodedURL with query parameter *name* removed. - - Optionally also filter for *value*, as well as cap the number - of parameters removed with *limit*. 
- """ - if limit is None: - if value is _UNSET: - nq = [(k, v) for (k, v) in self.query if k != name] - else: - nq = [(k, v) for (k, v) in self.query if not (k == name and v == value)] - else: - nq, removed_count = [], 0 - for k, v in self.query: - if k == name and (value is _UNSET or v == value) and removed_count < limit: - removed_count += 1 # drop it - else: - nq.append((k, v)) # keep it - - return self.replace(query=nq) - - def __repr__(self): - cn = self.__class__.__name__ - return '%s(url=%r)' % (cn, self._url) - - def __str__(self): - # TODO: the underlying URL's __str__ needs to change to make - # this work as the URL, see #55 - return str(self._url) - - def __eq__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return self.normalize().to_uri() == other.normalize().to_uri() - - def __ne__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return not self.__eq__(other) - - def __hash__(self): - return hash((self.__class__, self.scheme, self.userinfo, self.host, - self.path, self.query, self.fragment, self.port, - self.rooted, self.uses_netloc)) - - # # Begin Twisted Compat Code - asURI = to_uri - asIRI = to_iri - - @classmethod - def fromText(cls, s, lazy=False): - return cls.from_text(s, lazy=lazy) - - def asText(self, includeSecrets=False): - return self.to_text(with_password=includeSecrets) - - def __dir__(self): - try: - ret = object.__dir__(self) - except AttributeError: - # object.__dir__ == AttributeError # pdw for py2 - ret = dir(self.__class__) + list(self.__dict__.keys()) - ret = sorted(set(ret) - set(['fromText', 'asURI', 'asIRI', 'asText'])) - return ret - - # # End Twisted Compat Code - - -def parse(url, decoded=True, lazy=False): - """Automatically turn text into a structured URL object. 
- - Args: - - decoded (bool): Whether or not to return a :class:`DecodedURL`, - which automatically handles all - encoding/decoding/quoting/unquoting for all the various - accessors of parts of the URL, or an :class:`EncodedURL`, - which has the same API, but requires handling of special - characters for different parts of the URL. - - lazy (bool): In the case of `decoded=True`, this controls - whether the URL is decoded immediately or as accessed. The - default, `lazy=False`, checks all encoded parts of the URL - for decodability. - """ - enc_url = EncodedURL.from_text(url) - if not decoded: - return enc_url - dec_url = DecodedURL(enc_url, lazy=lazy) - return dec_url diff --git a/hyperlink/test/__init__.py b/hyperlink/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/hyperlink/test/common.py b/hyperlink/test/common.py deleted file mode 100644 index 28eba527..00000000 --- a/hyperlink/test/common.py +++ /dev/null @@ -1,58 +0,0 @@ - - -from unittest import TestCase - - -class HyperlinkTestCase(TestCase): - """This type mostly exists to provide a backwards-compatible - assertRaises method for Python 2.6 testing. - """ - def assertRaises(self, excClass, callableObj=None, *args, **kwargs): - """Fail unless an exception of class excClass is raised - by callableObj when invoked with arguments args and keyword - arguments kwargs. If a different type of exception is - raised, it will not be caught, and the test case will be - deemed to have suffered an error, exactly as for an - unexpected exception. - - If called with callableObj omitted or None, will return a - context object used like this:: - - with self.assertRaises(SomeException): - do_something() - - The context manager keeps a reference to the exception as - the 'exception' attribute. 
This allows you to inspect the - exception after the assertion:: - - with self.assertRaises(SomeException) as cm: - do_something() - the_exception = cm.exception - self.assertEqual(the_exception.error_code, 3) - """ - context = _AssertRaisesContext(excClass, self) - if callableObj is None: - return context - with context: - callableObj(*args, **kwargs) - - -class _AssertRaisesContext(object): - "A context manager used to implement HyperlinkTestCase.assertRaises." - - def __init__(self, expected, test_case): - self.expected = expected - self.failureException = test_case.failureException - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, tb): - if exc_type is None: - exc_name = self.expected.__name__ - raise self.failureException("%s not raised" % (exc_name,)) - if not issubclass(exc_type, self.expected): - # let unexpected exceptions pass through - return False - self.exception = exc_value # store for later retrieval - return True diff --git a/hyperlink/test/test_decoded_url.py b/hyperlink/test/test_decoded_url.py deleted file mode 100644 index 4e6f8b97..00000000 --- a/hyperlink/test/test_decoded_url.py +++ /dev/null @@ -1,180 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - -from .. 
import DecodedURL -from .._url import _percent_decode -from .common import HyperlinkTestCase - -BASIC_URL = 'http://example.com/#' -TOTAL_URL = "https://%75%73%65%72:%00%00%00%00@xn--bcher-kva.ch:8080/a/nice%20nice/./path/?zot=23%25&zut#frég" - - -class TestURL(HyperlinkTestCase): - - def test_durl_basic(self): - bdurl = DecodedURL.from_text(BASIC_URL) - assert bdurl.scheme == 'http' - assert bdurl.host == 'example.com' - assert bdurl.port == 80 - assert bdurl.path == ('',) - assert bdurl.fragment == '' - - durl = DecodedURL.from_text(TOTAL_URL) - - assert durl.scheme == 'https' - assert durl.host == 'bücher.ch' - assert durl.port == 8080 - assert durl.path == ('a', 'nice nice', '.', 'path', '') - assert durl.fragment == 'frég' - assert durl.get('zot') == ['23%'] - - assert durl.user == 'user' - assert durl.userinfo == ('user', '\0\0\0\0') - - def test_passthroughs(self): - # just basic tests for the methods that more or less pass straight - # through to the underlying URL - - durl = DecodedURL.from_text(TOTAL_URL) - assert durl.sibling('te%t').path[-1] == 'te%t' - assert durl.child('../test2%').path[-1] == '../test2%' - assert durl.child() == durl - assert durl.child() is durl - assert durl.click('/').path[-1] == '' - assert durl.user == 'user' - - assert '.' in durl.path - assert '.' 
not in durl.normalize().path - - assert durl.to_uri().fragment == 'fr%C3%A9g' - assert ' ' in durl.to_iri().path[1] - - assert durl.to_text(with_password=True) == TOTAL_URL - - assert durl.absolute - assert durl.rooted - - assert durl == durl.encoded_url.get_decoded_url() - - durl2 = DecodedURL.from_text(TOTAL_URL, lazy=True) - assert durl2 == durl2.encoded_url.get_decoded_url(lazy=True) - - assert str(DecodedURL.from_text(BASIC_URL).child(' ')) == 'http://example.com/%20' - - assert not (durl == 1) - assert durl != 1 - - def test_repr(self): - durl = DecodedURL.from_text(TOTAL_URL) - assert repr(durl) == 'DecodedURL(url=' + repr(durl._url) + ')' - - def test_query_manipulation(self): - durl = DecodedURL.from_text(TOTAL_URL) - - assert durl.get('zot') == ['23%'] - durl = durl.add(' ', 'space') - assert durl.get(' ') == ['space'] - durl = durl.set(' ', 'spa%ed') - assert durl.get(' ') == ['spa%ed'] - - durl = DecodedURL(url=durl.to_uri()) - assert durl.get(' ') == ['spa%ed'] - durl = durl.remove(' ') - assert durl.get(' ') == [] - - durl = DecodedURL.from_text('/?%61rg=b&arg=c') - assert durl.get('arg') == ['b', 'c'] - - assert durl.set('arg', 'd').get('arg') == ['d'] - - durl = DecodedURL.from_text(u"https://example.com/a/b/?fóó=1&bar=2&fóó=3") - assert durl.remove("fóó") == DecodedURL.from_text("https://example.com/a/b/?bar=2") - assert durl.remove("fóó", value="1") == DecodedURL.from_text("https://example.com/a/b/?bar=2&fóó=3") - assert durl.remove("fóó", limit=1) == DecodedURL.from_text("https://example.com/a/b/?bar=2&fóó=3") - assert durl.remove("fóó", value="1", limit=0) == DecodedURL.from_text("https://example.com/a/b/?fóó=1&bar=2&fóó=3") - - def test_equality_and_hashability(self): - durl = DecodedURL.from_text(TOTAL_URL) - durl2 = DecodedURL.from_text(TOTAL_URL) - burl = DecodedURL.from_text(BASIC_URL) - durl_uri = durl.to_uri() - - assert durl == durl - assert durl == durl2 - assert durl != burl - assert durl != None - assert durl != durl._url - - durl_map 
= {} - durl_map[durl] = durl - durl_map[durl2] = durl2 - - assert len(durl_map) == 1 - - durl_map[burl] = burl - - assert len(durl_map) == 2 - - durl_map[durl_uri] = durl_uri - - assert len(durl_map) == 3 - - def test_replace_roundtrip(self): - durl = DecodedURL.from_text(TOTAL_URL) - - durl2 = durl.replace(scheme=durl.scheme, - host=durl.host, - path=durl.path, - query=durl.query, - fragment=durl.fragment, - port=durl.port, - rooted=durl.rooted, - userinfo=durl.userinfo, - uses_netloc=durl.uses_netloc) - - assert durl == durl2 - - def test_replace_userinfo(self): - durl = DecodedURL.from_text(TOTAL_URL) - with self.assertRaises(ValueError): - durl.replace(userinfo=['user', 'pw', 'thiswillcauseafailure']) - return - - def test_twisted_compat(self): - durl = DecodedURL.from_text(TOTAL_URL) - - assert durl == DecodedURL.fromText(TOTAL_URL) - assert 'to_text' in dir(durl) - assert 'asText' not in dir(durl) - assert durl.to_text() == durl.asText() - - def test_percent_decode_bytes(self): - assert _percent_decode('%00', subencoding=False) == b'\0' - - def test_percent_decode_mixed(self): - # See https://github.com/python-hyper/hyperlink/pull/59 for a - # nice discussion of the possibilities - assert _percent_decode('abcdé%C3%A9éfg') == 'abcdéééfg' - - # still allow percent encoding in the case of an error - assert _percent_decode('abcdé%C3éfg') == 'abcdé%C3éfg' - - # ...unless explicitly told otherwise - with self.assertRaises(UnicodeDecodeError): - _percent_decode('abcdé%C3éfg', raise_subencoding_exc=True) - - # check that getting raw bytes works ok - assert _percent_decode('a%00b', subencoding=False) == b'a\x00b' - - # when not encodable as subencoding - assert _percent_decode('é%25é', subencoding='ascii') == 'é%25é' - - def test_click_decoded_url(self): - durl = DecodedURL.from_text(TOTAL_URL) - durl_dest = DecodedURL.from_text('/tëst') - - clicked = durl.click(durl_dest) - assert clicked.host == durl.host - assert clicked.path == durl_dest.path - assert clicked.path 
== ('tëst',) diff --git a/hyperlink/test/test_scheme_registration.py b/hyperlink/test/test_scheme_registration.py deleted file mode 100644 index d344353c..00000000 --- a/hyperlink/test/test_scheme_registration.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - - -from .. import _url -from .common import HyperlinkTestCase -from .._url import register_scheme, URL - - -class TestSchemeRegistration(HyperlinkTestCase): - - def setUp(self): - self._orig_scheme_port_map = dict(_url.SCHEME_PORT_MAP) - self._orig_no_netloc_schemes = set(_url.NO_NETLOC_SCHEMES) - - def tearDown(self): - _url.SCHEME_PORT_MAP = self._orig_scheme_port_map - _url.NO_NETLOC_SCHEMES = self._orig_no_netloc_schemes - - def test_register_scheme_basic(self): - register_scheme('deltron', uses_netloc=True, default_port=3030) - - u1 = URL.from_text('deltron://example.com') - assert u1.scheme == 'deltron' - assert u1.port == 3030 - assert u1.uses_netloc is True - - # test netloc works even when the original gives no indication - u2 = URL.from_text('deltron:') - u2 = u2.replace(host='example.com') - assert u2.to_text() == 'deltron://example.com' - - # test default port means no emission - u3 = URL.from_text('deltron://example.com:3030') - assert u3.to_text() == 'deltron://example.com' - - register_scheme('nonetron', default_port=3031) - u4 = URL(scheme='nonetron') - u4 = u4.replace(host='example.com') - assert u4.to_text() == 'nonetron://example.com' - - def test_register_no_netloc_scheme(self): - register_scheme('noloctron', uses_netloc=False) - u4 = URL(scheme='noloctron') - u4 = u4.replace(path=("example", "path")) - assert u4.to_text() == 'noloctron:example/path' - - def test_register_no_netloc_with_port(self): - with self.assertRaises(ValueError): - register_scheme('badnetlocless', uses_netloc=False, default_port=7) - - def test_invalid_uses_netloc(self): - with self.assertRaises(ValueError): - register_scheme('badnetloc', uses_netloc=None) - with 
self.assertRaises(ValueError): - register_scheme('badnetloc', uses_netloc=object()) - - def test_register_invalid_uses_netloc(self): - with self.assertRaises(ValueError): - register_scheme('lol', uses_netloc=lambda: 'nope') - - def test_register_invalid_port(self): - with self.assertRaises(ValueError): - register_scheme('nope', default_port=lambda: 'lol') diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py deleted file mode 100644 index 09405857..00000000 --- a/hyperlink/test/test_url.py +++ /dev/null @@ -1,1210 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) Twisted Matrix Laboratories. -# See LICENSE for details. - -from __future__ import unicode_literals - -import sys -import socket - -from .common import HyperlinkTestCase -from .. import URL, URLParseError -# automatically import the py27 windows implementation when appropriate -from .. import _url -from .._url import inet_pton, SCHEME_PORT_MAP, parse_host - - -PY2 = (sys.version_info[0] == 2) -unicode = type(u'') - - -BASIC_URL = "http://www.foo.com/a/nice/path/?zot=23&zut" - -# Examples from RFC 3986 section 5.4, Reference Resolution Examples -relativeLinkBaseForRFC3986 = 'http://a/b/c/d;p?q' -relativeLinkTestsForRFC3986 = [ - # "Normal" - # ('g:h', 'g:h'), # can't click on a scheme-having url without an abs path - ('g', 'http://a/b/c/g'), - ('./g', 'http://a/b/c/g'), - ('g/', 'http://a/b/c/g/'), - ('/g', 'http://a/g'), - ('//g', 'http://g'), - ('?y', 'http://a/b/c/d;p?y'), - ('g?y', 'http://a/b/c/g?y'), - ('#s', 'http://a/b/c/d;p?q#s'), - ('g#s', 'http://a/b/c/g#s'), - ('g?y#s', 'http://a/b/c/g?y#s'), - (';x', 'http://a/b/c/;x'), - ('g;x', 'http://a/b/c/g;x'), - ('g;x?y#s', 'http://a/b/c/g;x?y#s'), - ('', 'http://a/b/c/d;p?q'), - ('.', 'http://a/b/c/'), - ('./', 'http://a/b/c/'), - ('..', 'http://a/b/'), - ('../', 'http://a/b/'), - ('../g', 'http://a/b/g'), - ('../..', 'http://a/'), - ('../../', 'http://a/'), - ('../../g', 'http://a/g'), - - # Abnormal examples - # ".." 
cannot be used to change the authority component of a URI. - ('../../../g', 'http://a/g'), - ('../../../../g', 'http://a/g'), - - # Only include "." and ".." when they are only part of a larger segment, - # not by themselves. - ('/./g', 'http://a/g'), - ('/../g', 'http://a/g'), - ('g.', 'http://a/b/c/g.'), - ('.g', 'http://a/b/c/.g'), - ('g..', 'http://a/b/c/g..'), - ('..g', 'http://a/b/c/..g'), - # Unnecessary or nonsensical forms of "." and "..". - ('./../g', 'http://a/b/g'), - ('./g/.', 'http://a/b/c/g/'), - ('g/./h', 'http://a/b/c/g/h'), - ('g/../h', 'http://a/b/c/h'), - ('g;x=1/./y', 'http://a/b/c/g;x=1/y'), - ('g;x=1/../y', 'http://a/b/c/y'), - # Separating the reference's query and fragment components from the path. - ('g?y/./x', 'http://a/b/c/g?y/./x'), - ('g?y/../x', 'http://a/b/c/g?y/../x'), - ('g#s/./x', 'http://a/b/c/g#s/./x'), - ('g#s/../x', 'http://a/b/c/g#s/../x') -] - - -ROUNDTRIP_TESTS = ( - "http://localhost", - "http://localhost/", - "http://127.0.0.1/", - "http://[::127.0.0.1]/", - "http://[::1]/", - "http://localhost/foo", - "http://localhost/foo/", - "http://localhost/foo!!bar/", - "http://localhost/foo%20bar/", - "http://localhost/foo%2Fbar/", - "http://localhost/foo?n", - "http://localhost/foo?n=v", - "http://localhost/foo?n=/a/b", - "http://example.com/foo!@$bar?b!@z=123", - "http://localhost/asd?a=asd%20sdf/345", - "http://(%2525)/(%2525)?(%2525)&(%2525)=(%2525)#(%2525)", - "http://(%C3%A9)/(%C3%A9)?(%C3%A9)&(%C3%A9)=(%C3%A9)#(%C3%A9)", - "?sslrootcert=/Users/glyph/Downloads/rds-ca-2015-root.pem&sslmode=verify", - - # from boltons.urlutils' tests - - 'http://googlewebsite.com/e-shops.aspx', - 'http://example.com:8080/search?q=123&business=Nothing%20Special', - 'http://hatnote.com:9000/?arg=1&arg=2&arg=3', - 'https://xn--bcher-kva.ch', - 'http://xn--ggbla1c4e.xn--ngbc5azd/', - 'http://tools.ietf.org/html/rfc3986#section-3.4', - # 'http://wiki:pedia@hatnote.com', - 'ftp://ftp.rfc-editor.org/in-notes/tar/RFCs0001-0500.tar.gz', - 
'http://[1080:0:0:0:8:800:200C:417A]/index.html', - 'ssh://192.0.2.16:2222/', - 'https://[::101.45.75.219]:80/?hi=bye', - 'ldap://[::192.9.5.5]/dc=example,dc=com??sub?(sn=Jensen)', - 'mailto:me@example.com?to=me@example.com&body=hi%20http://wikipedia.org', - 'news:alt.rec.motorcycle', - 'tel:+1-800-867-5309', - 'urn:oasis:member:A00024:x', - ('magnet:?xt=urn:btih:1a42b9e04e122b97a5254e3df77ab3c4b7da725f&dn=Puppy%' - '20Linux%20precise-5.7.1.iso&tr=udp://tracker.openbittorrent.com:80&' - 'tr=udp://tracker.publicbt.com:80&tr=udp://tracker.istole.it:6969&' - 'tr=udp://tracker.ccc.de:80&tr=udp://open.demonii.com:1337'), - - # percent-encoded delimiters in percent-encodable fields - - 'https://%3A@example.com/', # colon in username - 'https://%40@example.com/', # at sign in username - 'https://%2f@example.com/', # slash in username - 'https://a:%3a@example.com/', # colon in password - 'https://a:%40@example.com/', # at sign in password - 'https://a:%2f@example.com/', # slash in password - 'https://a:%3f@example.com/', # question mark in password - 'https://example.com/%2F/', # slash in path - 'https://example.com/%3F/', # question mark in path - 'https://example.com/%23/', # hash in path - 'https://example.com/?%23=b', # hash in query param name - 'https://example.com/?%3D=b', # equals in query param name - 'https://example.com/?%26=b', # ampersand in query param name - 'https://example.com/?a=%23', # hash in query param value - 'https://example.com/?a=%26', # ampersand in query param value - 'https://example.com/?a=%3D', # equals in query param value - # double-encoded percent sign in all percent-encodable positions: - "http://(%2525):(%2525)@example.com/(%2525)/?(%2525)=(%2525)#(%2525)", - # colon in first part of schemeless relative url - 'first_seg_rel_path__colon%3Anotok/second_seg__colon%3Aok', -) - - -class TestURL(HyperlinkTestCase): - """ - Tests for L{URL}. - """ - - def assertUnicoded(self, u): - """ - The given L{URL}'s components should be L{unicode}. 
- - @param u: The L{URL} to test. - """ - self.assertTrue(isinstance(u.scheme, unicode) or u.scheme is None, - repr(u)) - self.assertTrue(isinstance(u.host, unicode) or u.host is None, - repr(u)) - for seg in u.path: - self.assertEqual(type(seg), unicode, repr(u)) - for (k, v) in u.query: - self.assertEqual(type(seg), unicode, repr(u)) - self.assertTrue(v is None or isinstance(v, unicode), repr(u)) - self.assertEqual(type(u.fragment), unicode, repr(u)) - - def assertURL(self, u, scheme, host, path, query, - fragment, port, userinfo=''): - """ - The given L{URL} should have the given components. - - @param u: The actual L{URL} to examine. - - @param scheme: The expected scheme. - - @param host: The expected host. - - @param path: The expected path. - - @param query: The expected query. - - @param fragment: The expected fragment. - - @param port: The expected port. - - @param userinfo: The expected userinfo. - """ - actual = (u.scheme, u.host, u.path, u.query, - u.fragment, u.port, u.userinfo) - expected = (scheme, host, tuple(path), tuple(query), - fragment, port, u.userinfo) - self.assertEqual(actual, expected) - - def test_initDefaults(self): - """ - L{URL} should have appropriate default values. - """ - def check(u): - self.assertUnicoded(u) - self.assertURL(u, 'http', '', [], [], '', 80, '') - - check(URL('http', '')) - check(URL('http', '', [], [])) - check(URL('http', '', [], [], '')) - - def test_init(self): - """ - L{URL} should accept L{unicode} parameters. - """ - u = URL('s', 'h', ['p'], [('k', 'v'), ('k', None)], 'f') - self.assertUnicoded(u) - self.assertURL(u, 's', 'h', ['p'], [('k', 'v'), ('k', None)], - 'f', None) - - self.assertURL(URL('http', '\xe0', ['\xe9'], - [('\u03bb', '\u03c0')], '\u22a5'), - 'http', '\xe0', ['\xe9'], - [('\u03bb', '\u03c0')], '\u22a5', 80) - - def test_initPercent(self): - """ - L{URL} should accept (and not interpret) percent characters. 
- """ - u = URL('s', '%68', ['%70'], [('%6B', '%76'), ('%6B', None)], - '%66') - self.assertUnicoded(u) - self.assertURL(u, - 's', '%68', ['%70'], - [('%6B', '%76'), ('%6B', None)], - '%66', None) - - def test_repr(self): - """ - L{URL.__repr__} will display the canonical form of the URL, wrapped in - a L{URL.from_text} invocation, so that it is C{eval}-able but still easy - to read. - """ - self.assertEqual( - repr(URL(scheme='http', host='foo', path=['bar'], - query=[('baz', None), ('k', 'v')], - fragment='frob')), - "URL.from_text(%s)" % (repr(u"http://foo/bar?baz&k=v#frob"),) - ) - - def test_from_text(self): - """ - Round-tripping L{URL.from_text} with C{str} results in an equivalent - URL. - """ - urlpath = URL.from_text(BASIC_URL) - self.assertEqual(BASIC_URL, urlpath.to_text()) - - def test_roundtrip(self): - """ - L{URL.to_text} should invert L{URL.from_text}. - """ - for test in ROUNDTRIP_TESTS: - result = URL.from_text(test).to_text(with_password=True) - self.assertEqual(test, result) - - def test_roundtrip_double_iri(self): - for test in ROUNDTRIP_TESTS: - url = URL.from_text(test) - iri = url.to_iri() - double_iri = iri.to_iri() - assert iri == double_iri - - iri_text = iri.to_text(with_password=True) - double_iri_text = double_iri.to_text(with_password=True) - assert iri_text == double_iri_text - return - - def test_equality(self): - """ - Two URLs decoded using L{URL.from_text} will be equal (C{==}) if they - decoded same URL string, and unequal (C{!=}) if they decoded different - strings. - """ - urlpath = URL.from_text(BASIC_URL) - self.assertEqual(urlpath, URL.from_text(BASIC_URL)) - self.assertNotEqual( - urlpath, - URL.from_text('ftp://www.anotherinvaliddomain.com/' - 'foo/bar/baz/?zot=21&zut') - ) - - def test_fragmentEquality(self): - """ - An URL created with the empty string for a fragment compares equal - to an URL created with an unspecified fragment. 
- """ - self.assertEqual(URL(fragment=''), URL()) - self.assertEqual(URL.from_text(u"http://localhost/#"), - URL.from_text(u"http://localhost/")) - - def test_child(self): - """ - L{URL.child} appends a new path segment, but does not affect the query - or fragment. - """ - urlpath = URL.from_text(BASIC_URL) - self.assertEqual("http://www.foo.com/a/nice/path/gong?zot=23&zut", - urlpath.child('gong').to_text()) - self.assertEqual("http://www.foo.com/a/nice/path/gong%2F?zot=23&zut", - urlpath.child('gong/').to_text()) - self.assertEqual( - "http://www.foo.com/a/nice/path/gong%2Fdouble?zot=23&zut", - urlpath.child('gong/double').to_text() - ) - self.assertEqual( - "http://www.foo.com/a/nice/path/gong%2Fdouble%2F?zot=23&zut", - urlpath.child('gong/double/').to_text() - ) - - def test_multiChild(self): - """ - L{URL.child} receives multiple segments as C{*args} and appends each in - turn. - """ - url = URL.from_text('http://example.com/a/b') - self.assertEqual(url.child('c', 'd', 'e').to_text(), - 'http://example.com/a/b/c/d/e') - - def test_childInitRoot(self): - """ - L{URL.child} of a L{URL} without a path produces a L{URL} with a single - path segment. - """ - childURL = URL(host=u"www.foo.com").child(u"c") - self.assertTrue(childURL.rooted) - self.assertEqual("http://www.foo.com/c", childURL.to_text()) - - def test_emptyChild(self): - """ - L{URL.child} without any new segments returns the original L{URL}. - """ - url = URL(host=u"www.foo.com") - self.assertEqual(url.child(), url) - - def test_sibling(self): - """ - L{URL.sibling} of a L{URL} replaces the last path segment, but does not - affect the query or fragment. - """ - urlpath = URL.from_text(BASIC_URL) - self.assertEqual( - "http://www.foo.com/a/nice/path/sister?zot=23&zut", - urlpath.sibling('sister').to_text() - ) - # Use an url without trailing '/' to check child removal. 
- url_text = "http://www.foo.com/a/nice/path?zot=23&zut" - urlpath = URL.from_text(url_text) - self.assertEqual( - "http://www.foo.com/a/nice/sister?zot=23&zut", - urlpath.sibling('sister').to_text() - ) - - def test_click(self): - """ - L{URL.click} interprets the given string as a relative URI-reference - and returns a new L{URL} interpreting C{self} as the base absolute URI. - """ - urlpath = URL.from_text(BASIC_URL) - # A null uri should be valid (return here). - self.assertEqual("http://www.foo.com/a/nice/path/?zot=23&zut", - urlpath.click("").to_text()) - # A simple relative path remove the query. - self.assertEqual("http://www.foo.com/a/nice/path/click", - urlpath.click("click").to_text()) - # An absolute path replace path and query. - self.assertEqual("http://www.foo.com/click", - urlpath.click("/click").to_text()) - # Replace just the query. - self.assertEqual("http://www.foo.com/a/nice/path/?burp", - urlpath.click("?burp").to_text()) - # One full url to another should not generate '//' between authority. - # and path - self.assertTrue("//foobar" not in - urlpath.click('http://www.foo.com/foobar').to_text()) - - # From a url with no query clicking a url with a query, the query - # should be handled properly. - u = URL.from_text('http://www.foo.com/me/noquery') - self.assertEqual('http://www.foo.com/me/17?spam=158', - u.click('/me/17?spam=158').to_text()) - - # Check that everything from the path onward is removed when the click - # link has no path. 
- u = URL.from_text('http://localhost/foo?abc=def') - self.assertEqual(u.click('http://www.python.org').to_text(), - 'http://www.python.org') - - # https://twistedmatrix.com/trac/ticket/8184 - u = URL.from_text('http://hatnote.com/a/b/../c/./d/e/..') - res = 'http://hatnote.com/a/c/d/' - self.assertEqual(u.click('').to_text(), res) - - # test click default arg is same as empty string above - self.assertEqual(u.click().to_text(), res) - - # test click on a URL instance - u = URL.fromText('http://localhost/foo/?abc=def') - u2 = URL.from_text('bar') - u3 = u.click(u2) - self.assertEqual(u3.to_text(), 'http://localhost/foo/bar') - - def test_clickRFC3986(self): - """ - L{URL.click} should correctly resolve the examples in RFC 3986. - """ - base = URL.from_text(relativeLinkBaseForRFC3986) - for (ref, expected) in relativeLinkTestsForRFC3986: - self.assertEqual(base.click(ref).to_text(), expected) - - def test_clickSchemeRelPath(self): - """ - L{URL.click} should not accept schemes with relative paths. - """ - base = URL.from_text(relativeLinkBaseForRFC3986) - self.assertRaises(NotImplementedError, base.click, 'g:h') - self.assertRaises(NotImplementedError, base.click, 'http:h') - - def test_cloneUnchanged(self): - """ - Verify that L{URL.replace} doesn't change any of the arguments it - is passed. - """ - urlpath = URL.from_text('https://x:1/y?z=1#A') - self.assertEqual(urlpath.replace(urlpath.scheme, - urlpath.host, - urlpath.path, - urlpath.query, - urlpath.fragment, - urlpath.port), - urlpath) - self.assertEqual(urlpath.replace(), urlpath) - - def test_clickCollapse(self): - """ - L{URL.click} collapses C{.} and C{..} according to RFC 3986 section - 5.2.4. 
- """ - tests = [ - ['http://localhost/', '.', 'http://localhost/'], - ['http://localhost/', '..', 'http://localhost/'], - ['http://localhost/a/b/c', '.', 'http://localhost/a/b/'], - ['http://localhost/a/b/c', '..', 'http://localhost/a/'], - ['http://localhost/a/b/c', './d/e', 'http://localhost/a/b/d/e'], - ['http://localhost/a/b/c', '../d/e', 'http://localhost/a/d/e'], - ['http://localhost/a/b/c', '/./d/e', 'http://localhost/d/e'], - ['http://localhost/a/b/c', '/../d/e', 'http://localhost/d/e'], - ['http://localhost/a/b/c/', '../../d/e/', - 'http://localhost/a/d/e/'], - ['http://localhost/a/./c', '../d/e', 'http://localhost/d/e'], - ['http://localhost/a/./c/', '../d/e', 'http://localhost/a/d/e'], - ['http://localhost/a/b/c/d', './e/../f/../g', - 'http://localhost/a/b/c/g'], - ['http://localhost/a/b/c', 'd//e', 'http://localhost/a/b/d//e'], - ] - for start, click, expected in tests: - actual = URL.from_text(start).click(click).to_text() - self.assertEqual( - actual, - expected, - "{start}.click({click}) => {actual} not {expected}".format( - start=start, - click=repr(click), - actual=actual, - expected=expected, - ) - ) - - def test_queryAdd(self): - """ - L{URL.add} adds query parameters. - """ - self.assertEqual( - "http://www.foo.com/a/nice/path/?foo=bar", - URL.from_text("http://www.foo.com/a/nice/path/") - .add(u"foo", u"bar").to_text()) - self.assertEqual( - "http://www.foo.com/?foo=bar", - URL(host=u"www.foo.com").add(u"foo", u"bar") - .to_text()) - urlpath = URL.from_text(BASIC_URL) - self.assertEqual( - "http://www.foo.com/a/nice/path/?zot=23&zut&burp", - urlpath.add(u"burp").to_text()) - self.assertEqual( - "http://www.foo.com/a/nice/path/?zot=23&zut&burp=xxx", - urlpath.add(u"burp", u"xxx").to_text()) - self.assertEqual( - "http://www.foo.com/a/nice/path/?zot=23&zut&burp=xxx&zing", - urlpath.add(u"burp", u"xxx").add(u"zing").to_text()) - # Note the inversion! 
- self.assertEqual( - "http://www.foo.com/a/nice/path/?zot=23&zut&zing&burp=xxx", - urlpath.add(u"zing").add(u"burp", u"xxx").to_text()) - # Note the two values for the same name. - self.assertEqual( - "http://www.foo.com/a/nice/path/?zot=23&zut&burp=xxx&zot=32", - urlpath.add(u"burp", u"xxx").add(u"zot", '32') - .to_text()) - - def test_querySet(self): - """ - L{URL.set} replaces query parameters by name. - """ - urlpath = URL.from_text(BASIC_URL) - self.assertEqual( - "http://www.foo.com/a/nice/path/?zot=32&zut", - urlpath.set(u"zot", '32').to_text()) - # Replace name without value with name/value and vice-versa. - self.assertEqual( - "http://www.foo.com/a/nice/path/?zot&zut=itworked", - urlpath.set(u"zot").set(u"zut", u"itworked").to_text() - ) - # Q: what happens when the query has two values and we replace? - # A: we replace both values with a single one - self.assertEqual( - "http://www.foo.com/a/nice/path/?zot=32&zut", - urlpath.add(u"zot", u"xxx").set(u"zot", '32').to_text() - ) - - def test_queryRemove(self): - """ - L{URL.remove} removes instances of a query parameter. - """ - url = URL.from_text(u"https://example.com/a/b/?foo=1&bar=2&foo=3") - self.assertEqual( - url.remove(u"foo"), - URL.from_text(u"https://example.com/a/b/?bar=2") - ) - - self.assertEqual( - url.remove(name=u"foo", value=u"1"), - URL.from_text(u"https://example.com/a/b/?bar=2&foo=3") - ) - - self.assertEqual( - url.remove(name=u"foo", limit=1), - URL.from_text(u"https://example.com/a/b/?bar=2&foo=3") - ) - - self.assertEqual( - url.remove(name=u"foo", value=u"1", limit=0), - URL.from_text(u"https://example.com/a/b/?foo=1&bar=2&foo=3") - ) - - def test_parseEqualSignInParamValue(self): - """ - Every C{=}-sign after the first in a query parameter is simply included - in the value of the parameter. 
- """ - u = URL.from_text('http://localhost/?=x=x=x') - self.assertEqual(u.get(''), ['x=x=x']) - self.assertEqual(u.to_text(), 'http://localhost/?=x=x=x') - u = URL.from_text('http://localhost/?foo=x=x=x&bar=y') - self.assertEqual(u.query, (('foo', 'x=x=x'), ('bar', 'y'))) - self.assertEqual(u.to_text(), 'http://localhost/?foo=x=x=x&bar=y') - - u = URL.from_text('https://example.com/?argument=3&argument=4&operator=%3D') - iri = u.to_iri() - self.assertEqual(iri.get('operator'), ['=']) - # assert that the equals is not unnecessarily escaped - self.assertEqual(iri.to_uri().get('operator'), ['=']) - - def test_empty(self): - """ - An empty L{URL} should serialize as the empty string. - """ - self.assertEqual(URL().to_text(), '') - - def test_justQueryText(self): - """ - An L{URL} with query text should serialize as just query text. - """ - u = URL(query=[(u"hello", u"world")]) - self.assertEqual(u.to_text(), '?hello=world') - - def test_identicalEqual(self): - """ - L{URL} compares equal to itself. - """ - u = URL.from_text('http://localhost/') - self.assertEqual(u, u) - - def test_similarEqual(self): - """ - URLs with equivalent components should compare equal. - """ - u1 = URL.from_text('http://u@localhost:8080/p/a/t/h?q=p#f') - u2 = URL.from_text('http://u@localhost:8080/p/a/t/h?q=p#f') - self.assertEqual(u1, u2) - - def test_differentNotEqual(self): - """ - L{URL}s that refer to different resources are both unequal (C{!=}) and - also not equal (not C{==}). - """ - u1 = URL.from_text('http://localhost/a') - u2 = URL.from_text('http://localhost/b') - self.assertFalse(u1 == u2, "%r != %r" % (u1, u2)) - self.assertNotEqual(u1, u2) - - def test_otherTypesNotEqual(self): - """ - L{URL} is not equal (C{==}) to other types. 
- """ - u = URL.from_text('http://localhost/') - self.assertFalse(u == 42, "URL must not equal a number.") - self.assertFalse(u == object(), "URL must not equal an object.") - self.assertNotEqual(u, 42) - self.assertNotEqual(u, object()) - - def test_identicalNotUnequal(self): - """ - Identical L{URL}s are not unequal (C{!=}) to each other. - """ - u = URL.from_text('http://u@localhost:8080/p/a/t/h?q=p#f') - self.assertFalse(u != u, "%r == itself" % u) - - def test_similarNotUnequal(self): - """ - Structurally similar L{URL}s are not unequal (C{!=}) to each other. - """ - u1 = URL.from_text('http://u@localhost:8080/p/a/t/h?q=p#f') - u2 = URL.from_text('http://u@localhost:8080/p/a/t/h?q=p#f') - self.assertFalse(u1 != u2, "%r == %r" % (u1, u2)) - - def test_differentUnequal(self): - """ - Structurally different L{URL}s are unequal (C{!=}) to each other. - """ - u1 = URL.from_text('http://localhost/a') - u2 = URL.from_text('http://localhost/b') - self.assertTrue(u1 != u2, "%r == %r" % (u1, u2)) - - def test_otherTypesUnequal(self): - """ - L{URL} is unequal (C{!=}) to other types. - """ - u = URL.from_text('http://localhost/') - self.assertTrue(u != 42, "URL must differ from a number.") - self.assertTrue(u != object(), "URL must be differ from an object.") - - def test_asURI(self): - """ - L{URL.asURI} produces an URI which converts any URI unicode encoding - into pure US-ASCII and returns a new L{URL}. 
- """ - unicodey = ('http://\N{LATIN SMALL LETTER E WITH ACUTE}.com/' - '\N{LATIN SMALL LETTER E}\N{COMBINING ACUTE ACCENT}' - '?\N{LATIN SMALL LETTER A}\N{COMBINING ACUTE ACCENT}=' - '\N{LATIN SMALL LETTER I}\N{COMBINING ACUTE ACCENT}' - '#\N{LATIN SMALL LETTER U}\N{COMBINING ACUTE ACCENT}') - iri = URL.from_text(unicodey) - uri = iri.asURI() - self.assertEqual(iri.host, '\N{LATIN SMALL LETTER E WITH ACUTE}.com') - self.assertEqual(iri.path[0], - '\N{LATIN SMALL LETTER E}\N{COMBINING ACUTE ACCENT}') - self.assertEqual(iri.to_text(), unicodey) - expectedURI = 'http://xn--9ca.com/%C3%A9?%C3%A1=%C3%AD#%C3%BA' - actualURI = uri.to_text() - self.assertEqual(actualURI, expectedURI, - '%r != %r' % (actualURI, expectedURI)) - - def test_asIRI(self): - """ - L{URL.asIRI} decodes any percent-encoded text in the URI, making it - more suitable for reading by humans, and returns a new L{URL}. - """ - asciiish = 'http://xn--9ca.com/%C3%A9?%C3%A1=%C3%AD#%C3%BA' - uri = URL.from_text(asciiish) - iri = uri.asIRI() - self.assertEqual(uri.host, 'xn--9ca.com') - self.assertEqual(uri.path[0], '%C3%A9') - self.assertEqual(uri.to_text(), asciiish) - expectedIRI = ('http://\N{LATIN SMALL LETTER E WITH ACUTE}.com/' - '\N{LATIN SMALL LETTER E WITH ACUTE}' - '?\N{LATIN SMALL LETTER A WITH ACUTE}=' - '\N{LATIN SMALL LETTER I WITH ACUTE}' - '#\N{LATIN SMALL LETTER U WITH ACUTE}') - actualIRI = iri.to_text() - self.assertEqual(actualIRI, expectedIRI, - '%r != %r' % (actualIRI, expectedIRI)) - - def test_badUTF8AsIRI(self): - """ - Bad UTF-8 in a path segment, query parameter, or fragment results in - that portion of the URI remaining percent-encoded in the IRI. 
- """ - urlWithBinary = 'http://xn--9ca.com/%00%FF/%C3%A9' - uri = URL.from_text(urlWithBinary) - iri = uri.asIRI() - expectedIRI = ('http://\N{LATIN SMALL LETTER E WITH ACUTE}.com/' - '%00%FF/' - '\N{LATIN SMALL LETTER E WITH ACUTE}') - actualIRI = iri.to_text() - self.assertEqual(actualIRI, expectedIRI, - '%r != %r' % (actualIRI, expectedIRI)) - - def test_alreadyIRIAsIRI(self): - """ - A L{URL} composed of non-ASCII text will result in non-ASCII text. - """ - unicodey = ('http://\N{LATIN SMALL LETTER E WITH ACUTE}.com/' - '\N{LATIN SMALL LETTER E}\N{COMBINING ACUTE ACCENT}' - '?\N{LATIN SMALL LETTER A}\N{COMBINING ACUTE ACCENT}=' - '\N{LATIN SMALL LETTER I}\N{COMBINING ACUTE ACCENT}' - '#\N{LATIN SMALL LETTER U}\N{COMBINING ACUTE ACCENT}') - iri = URL.from_text(unicodey) - alsoIRI = iri.asIRI() - self.assertEqual(alsoIRI.to_text(), unicodey) - - def test_alreadyURIAsURI(self): - """ - A L{URL} composed of encoded text will remain encoded. - """ - expectedURI = 'http://xn--9ca.com/%C3%A9?%C3%A1=%C3%AD#%C3%BA' - uri = URL.from_text(expectedURI) - actualURI = uri.asURI().to_text() - self.assertEqual(actualURI, expectedURI) - - def test_userinfo(self): - """ - L{URL.from_text} will parse the C{userinfo} portion of the URI - separately from the host and port. - """ - url = URL.from_text( - 'http://someuser:somepassword@example.com/some-segment@ignore' - ) - self.assertEqual(url.authority(True), - 'someuser:somepassword@example.com') - self.assertEqual(url.authority(False), 'someuser:@example.com') - self.assertEqual(url.userinfo, 'someuser:somepassword') - self.assertEqual(url.user, 'someuser') - self.assertEqual(url.to_text(), - 'http://someuser:@example.com/some-segment@ignore') - self.assertEqual( - url.replace(userinfo=u"someuser").to_text(), - 'http://someuser@example.com/some-segment@ignore' - ) - - def test_portText(self): - """ - L{URL.from_text} parses custom port numbers as integers. 
- """ - portURL = URL.from_text(u"http://www.example.com:8080/") - self.assertEqual(portURL.port, 8080) - self.assertEqual(portURL.to_text(), u"http://www.example.com:8080/") - - def test_mailto(self): - """ - Although L{URL} instances are mainly for dealing with HTTP, other - schemes (such as C{mailto:}) should work as well. For example, - L{URL.from_text}/L{URL.to_text} round-trips cleanly for a C{mailto:} URL - representing an email address. - """ - self.assertEqual(URL.from_text(u"mailto:user@example.com").to_text(), - u"mailto:user@example.com") - - def test_queryIterable(self): - """ - When a L{URL} is created with a C{query} argument, the C{query} - argument is converted into an N-tuple of 2-tuples, sensibly - handling dictionaries. - """ - expected = (('alpha', 'beta'),) - url = URL(query=[['alpha', 'beta']]) - self.assertEqual(url.query, expected) - url = URL(query={'alpha': 'beta'}) - self.assertEqual(url.query, expected) - - def test_pathIterable(self): - """ - When a L{URL} is created with a C{path} argument, the C{path} is - converted into a tuple. - """ - url = URL(path=['hello', 'world']) - self.assertEqual(url.path, ('hello', 'world')) - - def test_invalidArguments(self): - """ - Passing an argument of the wrong type to any of the constructor - arguments of L{URL} will raise a descriptive L{TypeError}. - - L{URL} typechecks very aggressively to ensure that its constitutent - parts are all properly immutable and to prevent confusing errors when - bad data crops up in a method call long after the code that called the - constructor is off the stack. 
- """ - class Unexpected(object): - def __str__(self): - return "wrong" - - def __repr__(self): - return "" - - defaultExpectation = "unicode" if bytes is str else "str" - - def assertRaised(raised, expectation, name): - self.assertEqual(str(raised.exception), - "expected {0} for {1}, got {2}".format( - expectation, - name, "")) - - def check(param, expectation=defaultExpectation): - with self.assertRaises(TypeError) as raised: - URL(**{param: Unexpected()}) - - assertRaised(raised, expectation, param) - - check("scheme") - check("host") - check("fragment") - check("rooted", "bool") - check("userinfo") - check("port", "int or NoneType") - - with self.assertRaises(TypeError) as raised: - URL(path=[Unexpected()]) - - assertRaised(raised, defaultExpectation, "path segment") - - with self.assertRaises(TypeError) as raised: - URL(query=[(u"name", Unexpected())]) - - assertRaised(raised, defaultExpectation + " or NoneType", - "query parameter value") - - with self.assertRaises(TypeError) as raised: - URL(query=[(Unexpected(), u"value")]) - - assertRaised(raised, defaultExpectation, "query parameter name") - # No custom error message for this one, just want to make sure - # non-2-tuples don't get through. 
- - with self.assertRaises(TypeError): - URL(query=[Unexpected()]) - - with self.assertRaises(ValueError): - URL(query=[('k', 'v', 'vv')]) - - with self.assertRaises(ValueError): - URL(query=[('k',)]) - - url = URL.from_text("https://valid.example.com/") - with self.assertRaises(TypeError) as raised: - url.child(Unexpected()) - assertRaised(raised, defaultExpectation, "path segment") - with self.assertRaises(TypeError) as raised: - url.sibling(Unexpected()) - assertRaised(raised, defaultExpectation, "path segment") - with self.assertRaises(TypeError) as raised: - url.click(Unexpected()) - assertRaised(raised, defaultExpectation, "relative URL") - - def test_technicallyTextIsIterableBut(self): - """ - Technically, L{str} (or L{unicode}, as appropriate) is iterable, but - C{URL(path="foo")} resulting in C{URL.from_text("f/o/o")} is never what - you want. - """ - with self.assertRaises(TypeError) as raised: - URL(path='foo') - self.assertEqual( - str(raised.exception), - "expected iterable of text for path, not: {0}" - .format(repr('foo')) - ) - - def test_netloc(self): - url = URL(scheme='https') - self.assertEqual(url.uses_netloc, True) - - url = URL(scheme='git+https') - self.assertEqual(url.uses_netloc, True) - - url = URL(scheme='mailto') - self.assertEqual(url.uses_netloc, False) - - url = URL(scheme='ztp') - self.assertEqual(url.uses_netloc, None) - - url = URL.from_text('ztp://test.com') - self.assertEqual(url.uses_netloc, True) - - url = URL.from_text('ztp:test:com') - self.assertEqual(url.uses_netloc, False) - - def test_ipv6_with_port(self): - t = 'https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:80/' - url = URL.from_text(t) - assert url.host == '2001:0db8:85a3:0000:0000:8a2e:0370:7334' - assert url.port == 80 - assert SCHEME_PORT_MAP[url.scheme] != url.port - - def test_basic(self): - text = 'https://user:pass@example.com/path/to/here?k=v#nice' - url = URL.from_text(text) - assert url.scheme == 'https' - assert url.userinfo == 'user:pass' - assert 
url.host == 'example.com' - assert url.path == ('path', 'to', 'here') - assert url.fragment == 'nice' - - text = 'https://user:pass@127.0.0.1/path/to/here?k=v#nice' - url = URL.from_text(text) - assert url.scheme == 'https' - assert url.userinfo == 'user:pass' - assert url.host == '127.0.0.1' - assert url.path == ('path', 'to', 'here') - - text = 'https://user:pass@[::1]/path/to/here?k=v#nice' - url = URL.from_text(text) - assert url.scheme == 'https' - assert url.userinfo == 'user:pass' - assert url.host == '::1' - assert url.path == ('path', 'to', 'here') - - def test_invalid_url(self): - self.assertRaises(URLParseError, URL.from_text, '#\n\n') - - def test_invalid_authority_url(self): - self.assertRaises(URLParseError, URL.from_text, 'http://abc:\n\n/#') - - def test_invalid_ipv6(self): - invalid_ipv6_ips = ['2001::0234:C1ab::A0:aabc:003F', - '2001::1::3F', - ':', - '::::', - '::256.0.0.1'] - for ip in invalid_ipv6_ips: - url_text = 'http://[' + ip + ']' - self.assertRaises(socket.error, inet_pton, - socket.AF_INET6, ip) - self.assertRaises(URLParseError, URL.from_text, url_text) - - def test_invalid_port(self): - self.assertRaises(URLParseError, URL.from_text, 'ftp://portmouth:smash') - self.assertRaises(ValueError, URL.from_text, - 'http://reader.googlewebsite.com:neverforget') - - def test_idna(self): - u1 = URL.from_text('http://bücher.ch') - self.assertEquals(u1.host, 'bücher.ch') - self.assertEquals(u1.to_text(), 'http://bücher.ch') - self.assertEquals(u1.to_uri().to_text(), 'http://xn--bcher-kva.ch') - - u2 = URL.from_text('https://xn--bcher-kva.ch') - self.assertEquals(u2.host, 'xn--bcher-kva.ch') - self.assertEquals(u2.to_text(), 'https://xn--bcher-kva.ch') - self.assertEquals(u2.to_iri().to_text(), u'https://bücher.ch') - - def test_netloc_slashes(self): - # basic sanity checks - url = URL.from_text('mailto:mahmoud@hatnote.com') - self.assertEquals(url.scheme, 'mailto') - self.assertEquals(url.to_text(), 'mailto:mahmoud@hatnote.com') - - url = 
URL.from_text('http://hatnote.com') - self.assertEquals(url.scheme, 'http') - self.assertEquals(url.to_text(), 'http://hatnote.com') - - # test that unrecognized schemes stay consistent with '//' - url = URL.from_text('newscheme:a:b:c') - self.assertEquals(url.scheme, 'newscheme') - self.assertEquals(url.to_text(), 'newscheme:a:b:c') - - url = URL.from_text('newerscheme://a/b/c') - self.assertEquals(url.scheme, 'newerscheme') - self.assertEquals(url.to_text(), 'newerscheme://a/b/c') - - # test that reasonable guesses are made - url = URL.from_text('git+ftp://gitstub.biz/glyph/lefkowitz') - self.assertEquals(url.scheme, 'git+ftp') - self.assertEquals(url.to_text(), - 'git+ftp://gitstub.biz/glyph/lefkowitz') - - url = URL.from_text('what+mailto:freerealestate@enotuniq.org') - self.assertEquals(url.scheme, 'what+mailto') - self.assertEquals(url.to_text(), - 'what+mailto:freerealestate@enotuniq.org') - - url = URL(scheme='ztp', path=('x', 'y', 'z'), rooted=True) - self.assertEquals(url.to_text(), 'ztp:/x/y/z') - - # also works when the input doesn't include '//' - url = URL(scheme='git+ftp', path=('x', 'y', 'z' ,''), - rooted=True, uses_netloc=True) - # broken bc urlunsplit - self.assertEquals(url.to_text(), 'git+ftp:///x/y/z/') - - # really why would this ever come up but ok - url = URL.from_text('file:///path/to/heck') - url2 = url.replace(scheme='mailto') - self.assertEquals(url2.to_text(), 'mailto:/path/to/heck') - - url_text = 'unregisteredscheme:///a/b/c' - url = URL.from_text(url_text) - no_netloc_url = url.replace(uses_netloc=False) - self.assertEquals(no_netloc_url.to_text(), 'unregisteredscheme:/a/b/c') - netloc_url = url.replace(uses_netloc=True) - self.assertEquals(netloc_url.to_text(), url_text) - - return - - def test_wrong_constructor(self): - with self.assertRaises(ValueError): - # whole URL not allowed - URL(BASIC_URL) - with self.assertRaises(ValueError): - # explicitly bad scheme not allowed - URL('HTTP_____more_like_imHoTTeP') - - def 
test_encoded_userinfo(self): - url = URL.from_text('http://user:pass@example.com') - assert url.userinfo == 'user:pass' - url = url.replace(userinfo='us%20her:pass') - iri = url.to_iri() - assert iri.to_text(with_password=True) == 'http://us her:pass@example.com' - assert iri.to_text(with_password=False) == 'http://us her:@example.com' - assert iri.to_uri().to_text(with_password=True) == 'http://us%20her:pass@example.com' - - def test_hash(self): - url_map = {} - url1 = URL.from_text('http://blog.hatnote.com/ask?utm_source=geocity') - assert hash(url1) == hash(url1) # sanity - - url_map[url1] = 1 - - url2 = URL.from_text('http://blog.hatnote.com/ask') - url2 = url2.set('utm_source', 'geocity') - - url_map[url2] = 2 - - assert len(url_map) == 1 - assert list(url_map.values()) == [2] - - assert hash(URL()) == hash(URL()) # slightly more sanity - - def test_dir(self): - url = URL() - res = dir(url) - - assert len(res) > 15 - # twisted compat - assert 'fromText' not in res - assert 'asText' not in res - assert 'asURI' not in res - assert 'asIRI' not in res - - def test_twisted_compat(self): - url = URL.fromText(u'http://example.com/a%20té%C3%A9st') - assert url.asText() == 'http://example.com/a%20té%C3%A9st' - assert url.asURI().asText() == 'http://example.com/a%20t%C3%A9%C3%A9st' - # TODO: assert url.asIRI().asText() == u'http://example.com/a%20téést' - - def test_set_ordering(self): - # TODO - url = URL.from_text('http://example.com/?a=b&c') - url = url.set(u'x', u'x') - url = url.add(u'x', u'y') - assert url.to_text() == u'http://example.com/?a=b&x=x&c&x=y' - # Would expect: - # assert url.to_text() == u'http://example.com/?a=b&c&x=x&x=y' - - def test_schemeless_path(self): - "See issue #4" - u1 = URL.from_text("urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob") - u2 = URL.from_text(u1.to_text()) - assert u1 == u2 # sanity testing roundtripping - - u3 = URL.from_text(u1.to_iri().to_text()) - assert u1 == u3 - assert u2 == u3 - - # test that colons are ok past the first segment - 
u4 = URL.from_text("first-segment/urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob") - u5 = u4.to_iri() - assert u5.to_text() == u'first-segment/urn:ietf:wg:oauth:2.0:oob' - - u6 = URL.from_text(u5.to_text()).to_uri() - assert u5 == u6 # colons stay decoded bc they're not in the first seg - - def test_emoji_domain(self): - "See issue #7, affecting only narrow builds (2.6-3.3)" - url = URL.from_text('https://xn--vi8hiv.ws') - iri = url.to_iri() - iri.to_text() - # as long as we don't get ValueErrors, we're good - - def test_delim_in_param(self): - "Per issue #6 and #8" - self.assertRaises(ValueError, URL, scheme=u'http', host=u'a/c') - self.assertRaises(ValueError, URL, path=(u"?",)) - self.assertRaises(ValueError, URL, path=(u"#",)) - self.assertRaises(ValueError, URL, query=((u"&", "test"))) - - def test_empty_paths_eq(self): - u1 = URL.from_text('http://example.com/') - u2 = URL.from_text('http://example.com') - - assert u1 == u2 - - u1 = URL.from_text('http://example.com') - u2 = URL.from_text('http://example.com') - - assert u1 == u2 - - u1 = URL.from_text('http://example.com') - u2 = URL.from_text('http://example.com/') - - assert u1 == u2 - - u1 = URL.from_text('http://example.com/') - u2 = URL.from_text('http://example.com/') - - assert u1 == u2 - - def test_from_text_type(self): - assert URL.from_text(u'#ok').fragment == u'ok' # sanity - self.assertRaises(TypeError, URL.from_text, b'bytes://x.y.z') - self.assertRaises(TypeError, URL.from_text, object()) - - def test_from_text_bad_authority(self): - # bad ipv6 brackets - self.assertRaises(URLParseError, URL.from_text, 'http://[::1/') - self.assertRaises(URLParseError, URL.from_text, 'http://::1]/') - self.assertRaises(URLParseError, URL.from_text, 'http://[[::1]/') - self.assertRaises(URLParseError, URL.from_text, 'http://[::1]]/') - - # empty port - self.assertRaises(URLParseError, URL.from_text, 'http://127.0.0.1:') - # non-integer port - self.assertRaises(URLParseError, URL.from_text, 'http://127.0.0.1:hi') - # extra 
port colon (makes for an invalid host) - self.assertRaises(URLParseError, URL.from_text, 'http://127.0.0.1::80') - - def test_normalize(self): - url = URL.from_text('HTTP://Example.com/A%61/./../A%61?B%62=C%63#D%64') - assert url.get('Bb') == [] - assert url.get('B%62') == ['C%63'] - assert len(url.path) == 4 - - # test that most expected normalizations happen - norm_url = url.normalize() - - assert norm_url.scheme == 'http' - assert norm_url.host == 'example.com' - assert norm_url.path == ('Aa',) - assert norm_url.get('Bb') == ['Cc'] - assert norm_url.fragment == 'Dd' - assert norm_url.to_text() == 'http://example.com/Aa?Bb=Cc#Dd' - - # test that flags work - noop_norm_url = url.normalize(scheme=False, host=False, - path=False, query=False, fragment=False) - assert noop_norm_url == url - - # test that empty paths get at least one slash - slashless_url = URL.from_text('http://example.io') - slashful_url = slashless_url.normalize() - assert slashful_url.to_text() == 'http://example.io/' - - # test case normalization for percent encoding - delimited_url = URL.from_text('/a%2fb/cd%3f?k%3d=v%23#test') - norm_delimited_url = delimited_url.normalize() - assert norm_delimited_url.to_text() == '/a%2Fb/cd%3F?k%3D=v%23#test' - - # test invalid percent encoding during normalize - assert URL(path=('', '%te%sts')).normalize(percents=False).to_text() == '/%te%sts' - assert URL(path=('', '%te%sts')).normalize().to_text() == '/%25te%25sts' - - percenty_url = URL(scheme='ftp', path=['%%%', '%a%b'], query=[('%', '%%')], fragment='%', userinfo='%:%') - - assert percenty_url.to_text(with_password=True) == 'ftp://%:%@/%%%/%a%b?%=%%#%' - assert percenty_url.normalize().to_text(with_password=True) == 'ftp://%25:%25@/%25%25%25/%25a%25b?%25=%25%25#%25' - - def test_str(self): - # see also issue #49 - text = u'http://example.com/á/y%20a%20y/?b=%25' - url = URL.from_text(text) - assert unicode(url) == text - assert bytes(url) == b'http://example.com/%C3%A1/y%20a%20y/?b=%25' - - if PY2: - 
assert isinstance(str(url), bytes) - assert isinstance(unicode(url), unicode) - else: - assert isinstance(str(url), unicode) - assert isinstance(bytes(url), bytes) - - def test_idna_corners(self): - text = u'http://abé.com/' - url = URL.from_text(text) - assert url.to_iri().host == u'abé.com' - assert url.to_uri().host == u'xn--ab-cja.com' - - url = URL.from_text("http://ドメイン.テスト.co.jp#test") - assert url.to_iri().host == u'ドメイン.テスト.co.jp' - assert url.to_uri().host == u'xn--eckwd4c7c.xn--zckzah.co.jp' - - assert url.to_uri().get_decoded_url().host == u'ドメイン.テスト.co.jp' - - assert URL.from_text('http://Example.com').to_uri().get_decoded_url().host == 'example.com' diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..e7efe6ae --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +[build-system] + +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + + +[tool.black] + +line-length = 80 +target-version = ["py27"] diff --git a/requirements-test.txt b/requirements-test.txt deleted file mode 100644 index 0e9a261b..00000000 --- a/requirements-test.txt +++ /dev/null @@ -1,5 +0,0 @@ -coverage==4.4.1 -idna==2.5 -pytest==2.9.2 -pytest-cov==2.3.0 -tox==2.6.0 diff --git a/setup.py b/setup.py index 71cd4b18..f057fb8a 100644 --- a/setup.py +++ b/setup.py @@ -2,46 +2,56 @@ are you've used several just to read this text. Hyperlink is a featureful, pure-Python implementation of the URL, with -an emphasis on correctness. BSD licensed. +an emphasis on correctness. MIT licensed. See the docs at http://hyperlink.readthedocs.io. 
""" -from setuptools import setup - - -__author__ = 'Mahmoud Hashemi and Glyph Lefkowitz' -__version__ = '19.0.1dev' -__contact__ = 'mahmoud@hatnote.com' -__url__ = 'https://github.com/python-hyper/hyperlink' -__license__ = 'MIT' - - -setup(name='hyperlink', - version=__version__, - description="A featureful, immutable, and correct URL for Python.", - long_description=__doc__, - author=__author__, - author_email=__contact__, - url=__url__, - packages=['hyperlink', 'hyperlink.test'], - include_package_data=True, - zip_safe=False, - license=__license__, - platforms='any', - install_requires=['idna>=2.5'], - classifiers=[ - 'Topic :: Utilities', - 'Intended Audience :: Developers', - 'Topic :: Software Development :: Libraries', - 'Development Status :: 5 - Production/Stable', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: Implementation :: PyPy', ] - ) +from setuptools import find_packages, setup + + +__author__ = "Mahmoud Hashemi and Glyph Lefkowitz" +__version__ = "21.0.1dev" +__contact__ = "mahmoud@hatnote.com" +__url__ = "https://github.com/python-hyper/hyperlink" +__license__ = "MIT" + + +setup( + name="hyperlink", + version=__version__, + description="A featureful, immutable, and correct URL for Python.", + long_description=__doc__, + author=__author__, + author_email=__contact__, + url=__url__, + packages=find_packages(where="src"), + package_dir={"": "src"}, + package_data=dict(hyperlink=["py.typed", "idna-tables-properties.csv.gz"]), + zip_safe=False, + license=__license__, + platforms="any", + install_requires=["idna>=2.5", 'typing ; python_version<"3.5"'], + python_requires=">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*", + classifiers=[ + "Topic :: Utilities", + "Intended Audience :: Developers", + "Topic :: Software Development :: Libraries", + "Development 
Status :: 5 - Production/Stable", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: Implementation :: PyPy", + "License :: OSI Approved :: MIT License", + ], +) """ A brief checklist for release: diff --git a/src/hyperlink/__init__.py b/src/hyperlink/__init__.py new file mode 100644 index 00000000..f680b01a --- /dev/null +++ b/src/hyperlink/__init__.py @@ -0,0 +1,17 @@ +from ._url import ( + parse, + register_scheme, + URL, + EncodedURL, + DecodedURL, + URLParseError, +) + +__all__ = ( + "parse", + "register_scheme", + "URL", + "EncodedURL", + "DecodedURL", + "URLParseError", +) diff --git a/src/hyperlink/_socket.py b/src/hyperlink/_socket.py new file mode 100644 index 00000000..3bcf8970 --- /dev/null +++ b/src/hyperlink/_socket.py @@ -0,0 +1,53 @@ +try: + from socket import inet_pton +except ImportError: + from typing import TYPE_CHECKING + + if TYPE_CHECKING: # pragma: no cover + pass + else: + # based on https://gist.github.com/nnemkin/4966028 + # this code only applies on Windows Python 2.7 + import ctypes + import socket + + class SockAddr(ctypes.Structure): + _fields_ = [ + ("sa_family", ctypes.c_short), + ("__pad1", ctypes.c_ushort), + ("ipv4_addr", ctypes.c_byte * 4), + ("ipv6_addr", ctypes.c_byte * 16), + ("__pad2", ctypes.c_ulong), + ] + + WSAStringToAddressA = ctypes.windll.ws2_32.WSAStringToAddressA + WSAAddressToStringA = ctypes.windll.ws2_32.WSAAddressToStringA + + def inet_pton(address_family, ip_string): + # type: (int, str) -> bytes + addr = SockAddr() + ip_string_bytes = ip_string.encode("ascii") + addr.sa_family = address_family + addr_size 
= ctypes.c_int(ctypes.sizeof(addr)) + + try: + attribute, size = { + socket.AF_INET: ("ipv4_addr", 4), + socket.AF_INET6: ("ipv6_addr", 16), + }[address_family] + except KeyError: + raise socket.error("unknown address family") + + if ( + WSAStringToAddressA( + ip_string_bytes, + address_family, + None, + ctypes.byref(addr), + ctypes.byref(addr_size), + ) + != 0 + ): + raise socket.error(ctypes.FormatError()) + + return ctypes.string_at(getattr(addr, attribute), size) diff --git a/src/hyperlink/_url.py b/src/hyperlink/_url.py new file mode 100644 index 00000000..8797b5cc --- /dev/null +++ b/src/hyperlink/_url.py @@ -0,0 +1,2472 @@ +# -*- coding: utf-8 -*- +u"""Hyperlink provides Pythonic URL parsing, construction, and rendering. + +Usage is straightforward:: + + >>> import hyperlink + >>> url = hyperlink.parse(u'http://github.com/mahmoud/hyperlink?utm_source=docs') + >>> url.host + u'github.com' + >>> secure_url = url.replace(scheme=u'https') + >>> secure_url.get('utm_source')[0] + u'docs' + +Hyperlink's API centers on the :class:`DecodedURL` type, which wraps +the lower-level :class:`URL`, both of which can be returned by the +:func:`parse()` convenience function. 
+ +""" # noqa: E501 + +import re +import sys +import string +import socket +from socket import AF_INET, AF_INET6 + +try: + from socket import AddressFamily +except ImportError: + AddressFamily = int # type: ignore[assignment,misc] +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Mapping, + Optional, + Sequence, + Text, + Tuple, + Type, + TypeVar, + Union, + cast, + TYPE_CHECKING, + overload, +) +from unicodedata import normalize +from ._socket import inet_pton + +try: + from collections.abc import Mapping as MappingABC +except ImportError: # Python 2 + from collections import Mapping as MappingABC + +from idna import encode as idna_encode, decode as idna_decode + + +PY2 = sys.version_info[0] == 2 +try: + unichr +except NameError: # Py3 + unichr = chr # type: Callable[[int], Text] +NoneType = type(None) # type: Type[None] +QueryPairs = Tuple[Tuple[Text, Optional[Text]], ...] # internal representation +QueryParameters = Union[ + Mapping[Text, Optional[Text]], + QueryPairs, + Iterable[Tuple[Text, Optional[Text]]], +] +T = TypeVar("T") +# Literal is not available in all pythons so we only bring it in for mypy. +if TYPE_CHECKING: + from typing import Literal + + +# from boltons.typeutils +def make_sentinel(name="_MISSING", var_name=""): + # type: (str, str) -> object + """Creates and returns a new **instance** of a new class, suitable for + usage as a "sentinel", a kind of singleton often used to indicate + a value is missing when ``None`` is a valid input. + + Args: + name: Name of the Sentinel + var_name: Set this name to the name of the variable in its respective + module enable pickle-ability. + + >>> make_sentinel(var_name='_MISSING') + _MISSING + + The most common use cases here in boltons are as default values + for optional function arguments, partly because of its + less-confusing appearance in automatically generated + documentation. Sentinels also function well as placeholders in queues + and linked lists. + + .. 
note:: + + By design, additional calls to ``make_sentinel`` with the same + values will not produce equivalent objects. + + >>> make_sentinel('TEST') == make_sentinel('TEST') + False + >>> type(make_sentinel('TEST')) == type(make_sentinel('TEST')) + False + """ + + class Sentinel(object): + def __init__(self): + # type: () -> None + self.name = name + self.var_name = var_name + + def __repr__(self): + # type: () -> str + if self.var_name: + return self.var_name + return "%s(%r)" % (self.__class__.__name__, self.name) + + if var_name: + # superclass type hints don't allow str return type, but it is + # allowed in the docs, hence the ignore[override] below + def __reduce__(self): + # type: () -> str + return self.var_name + + def __nonzero__(self): + # type: () -> bool + return False + + __bool__ = __nonzero__ + + return Sentinel() + + +_unspecified = _UNSET = make_sentinel("_UNSET") # type: Any + + +# RFC 3986 Section 2.3, Unreserved URI Characters +# https://tools.ietf.org/html/rfc3986#section-2.3 +_UNRESERVED_CHARS = frozenset( + "~-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" +) + + +# URL parsing regex (based on RFC 3986 Appendix B, with modifications) +_URL_RE = re.compile( + r"^((?P[^:/?#]+):)?" + r"((?P<_netloc_sep>//)" + r"(?P[^/?#]*))?" + r"(?P[^?#]*)" + r"(\?(?P[^#]*))?" + r"(#(?P.*))?$" +) +_SCHEME_RE = re.compile(r"^[a-zA-Z0-9+-.]*$") +_AUTHORITY_RE = re.compile( + r"^(?:(?P[^@/?#]*)@)?" + r"(?P" + r"(?:\[(?P[^[\]/?#]*)\])" + r"|(?P[^:/?#[\]]*)" + r"|(?P.*?))?" 
+ r"(?::(?P.*))?$" +) + + +_HEX_CHAR_MAP = dict( + [ + ((a + b).encode("ascii"), unichr(int(a + b, 16)).encode("charmap")) + for a in string.hexdigits + for b in string.hexdigits + ] +) +_ASCII_RE = re.compile("([\x00-\x7f]+)") + +# RFC 3986 section 2.2, Reserved Characters +# https://tools.ietf.org/html/rfc3986#section-2.2 +_GEN_DELIMS = frozenset(u":/?#[]@") +_SUB_DELIMS = frozenset(u"!$&'()*+,;=") +_ALL_DELIMS = _GEN_DELIMS | _SUB_DELIMS + +_USERINFO_SAFE = _UNRESERVED_CHARS | _SUB_DELIMS | set(u"%") +_USERINFO_DELIMS = _ALL_DELIMS - _USERINFO_SAFE +_PATH_SAFE = _USERINFO_SAFE | set(u":@") +_PATH_DELIMS = _ALL_DELIMS - _PATH_SAFE +_SCHEMELESS_PATH_SAFE = _PATH_SAFE - set(":") +_SCHEMELESS_PATH_DELIMS = _ALL_DELIMS - _SCHEMELESS_PATH_SAFE +_FRAGMENT_SAFE = _UNRESERVED_CHARS | _PATH_SAFE | set(u"/?") +_FRAGMENT_DELIMS = _ALL_DELIMS - _FRAGMENT_SAFE +_QUERY_VALUE_SAFE = _UNRESERVED_CHARS | _FRAGMENT_SAFE - set(u"&") +_QUERY_VALUE_DELIMS = _ALL_DELIMS - _QUERY_VALUE_SAFE +_QUERY_KEY_SAFE = _UNRESERVED_CHARS | _QUERY_VALUE_SAFE - set(u"=") +_QUERY_KEY_DELIMS = _ALL_DELIMS - _QUERY_KEY_SAFE + + +def _make_decode_map(delims, allow_percent=False): + # type: (Iterable[Text], bool) -> Mapping[bytes, bytes] + ret = dict(_HEX_CHAR_MAP) + if not allow_percent: + delims = set(delims) | set([u"%"]) + for delim in delims: + _hexord = "{0:02X}".format(ord(delim)).encode("ascii") + _hexord_lower = _hexord.lower() + ret.pop(_hexord) + if _hexord != _hexord_lower: + ret.pop(_hexord_lower) + return ret + + +def _make_quote_map(safe_chars): + # type: (Iterable[Text]) -> Mapping[Union[int, Text], Text] + ret = {} # type: Dict[Union[int, Text], Text] + # v is included in the dict for py3 mostly, because bytestrings + # are iterables of ints, of course! 
+ for i, v in zip(range(256), range(256)): + c = chr(v) + if c in safe_chars: + ret[c] = ret[v] = c + else: + ret[c] = ret[v] = "%{0:02X}".format(i) + return ret + + +_USERINFO_PART_QUOTE_MAP = _make_quote_map(_USERINFO_SAFE) +_USERINFO_DECODE_MAP = _make_decode_map(_USERINFO_DELIMS) +_PATH_PART_QUOTE_MAP = _make_quote_map(_PATH_SAFE) +_SCHEMELESS_PATH_PART_QUOTE_MAP = _make_quote_map(_SCHEMELESS_PATH_SAFE) +_PATH_DECODE_MAP = _make_decode_map(_PATH_DELIMS) +_QUERY_KEY_QUOTE_MAP = _make_quote_map(_QUERY_KEY_SAFE) +_QUERY_KEY_DECODE_MAP = _make_decode_map(_QUERY_KEY_DELIMS) +_QUERY_VALUE_QUOTE_MAP = _make_quote_map(_QUERY_VALUE_SAFE) +_QUERY_VALUE_DECODE_MAP = _make_decode_map(_QUERY_VALUE_DELIMS | set("+")) +_FRAGMENT_QUOTE_MAP = _make_quote_map(_FRAGMENT_SAFE) +_FRAGMENT_DECODE_MAP = _make_decode_map(_FRAGMENT_DELIMS) +_UNRESERVED_QUOTE_MAP = _make_quote_map(_UNRESERVED_CHARS) +_UNRESERVED_DECODE_MAP = dict( + [ + (k, v) + for k, v in _HEX_CHAR_MAP.items() + if v.decode("ascii", "replace") in _UNRESERVED_CHARS + ] +) + +_ROOT_PATHS = frozenset(((), (u"",))) + + +def _encode_reserved(text, maximal=True): + # type: (Text, bool) -> Text + """A very comprehensive percent encoding for encoding all + delimiters. Used for arguments to DecodedURL, where a % means a + percent sign, and not the character used by URLs for escaping + bytes. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_UNRESERVED_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _UNRESERVED_QUOTE_MAP[t] if t in _UNRESERVED_CHARS else t + for t in text + ] + ) + + +def _encode_path_part(text, maximal=True): + # type: (Text, bool) -> Text + "Percent-encode a single segment of a URL path." 
+ if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_PATH_PART_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [_PATH_PART_QUOTE_MAP[t] if t in _PATH_DELIMS else t for t in text] + ) + + +def _encode_schemeless_path_part(text, maximal=True): + # type: (Text, bool) -> Text + """Percent-encode the first segment of a URL path for a URL without a + scheme specified. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_SCHEMELESS_PATH_PART_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _SCHEMELESS_PATH_PART_QUOTE_MAP[t] + if t in _SCHEMELESS_PATH_DELIMS + else t + for t in text + ] + ) + + +def _encode_path_parts( + text_parts, # type: Sequence[Text] + rooted=False, # type: bool + has_scheme=True, # type: bool + has_authority=True, # type: bool + maximal=True, # type: bool +): + # type: (...) -> Sequence[Text] + """ + Percent-encode a tuple of path parts into a complete path. + + Setting *maximal* to False percent-encodes only the reserved + characters that are syntactically necessary for serialization, + preserving any IRI-style textual data. + + Leaving *maximal* set to its default True percent-encodes + everything required to convert a portion of an IRI to a portion of + a URI. + + RFC 3986 3.3: + + If a URI contains an authority component, then the path component + must either be empty or begin with a slash ("/") character. If a URI + does not contain an authority component, then the path cannot begin + with two slash characters ("//"). In addition, a URI reference + (Section 4.1) may be a relative-path reference, in which case the + first path segment cannot contain a colon (":") character. + """ + if not text_parts: + return () + if rooted: + text_parts = (u"",) + tuple(text_parts) + # elif has_authority and text_parts: + # raise Exception('see rfc above') # TODO: too late to fail like this? 
+ encoded_parts = [] # type: List[Text] + if has_scheme: + encoded_parts = [ + _encode_path_part(part, maximal=maximal) if part else part + for part in text_parts + ] + else: + encoded_parts = [_encode_schemeless_path_part(text_parts[0])] + encoded_parts.extend( + [ + _encode_path_part(part, maximal=maximal) if part else part + for part in text_parts[1:] + ] + ) + return tuple(encoded_parts) + + +def _encode_query_key(text, maximal=True): + # type: (Text, bool) -> Text + """ + Percent-encode a single query string key or value. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_QUERY_KEY_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [_QUERY_KEY_QUOTE_MAP[t] if t in _QUERY_KEY_DELIMS else t for t in text] + ) + + +def _encode_query_value(text, maximal=True): + # type: (Text, bool) -> Text + """ + Percent-encode a single query string key or value. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_QUERY_VALUE_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _QUERY_VALUE_QUOTE_MAP[t] if t in _QUERY_VALUE_DELIMS else t + for t in text + ] + ) + + +def _encode_fragment_part(text, maximal=True): + # type: (Text, bool) -> Text + """Quote the fragment part of the URL. Fragments don't have + subdelimiters, so the whole URL fragment can be passed. + """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_FRAGMENT_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [_FRAGMENT_QUOTE_MAP[t] if t in _FRAGMENT_DELIMS else t for t in text] + ) + + +def _encode_userinfo_part(text, maximal=True): + # type: (Text, bool) -> Text + """Quote special characters in either the username or password + section of the URL. 
+ """ + if maximal: + bytestr = normalize("NFC", text).encode("utf8") + return u"".join([_USERINFO_PART_QUOTE_MAP[b] for b in bytestr]) + return u"".join( + [ + _USERINFO_PART_QUOTE_MAP[t] if t in _USERINFO_DELIMS else t + for t in text + ] + ) + + +# This port list painstakingly curated by hand searching through +# https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml +# and +# https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml +SCHEME_PORT_MAP = { + "acap": 674, + "afp": 548, + "dict": 2628, + "dns": 53, + "file": None, + "ftp": 21, + "git": 9418, + "gopher": 70, + "http": 80, + "https": 443, + "imap": 143, + "ipp": 631, + "ipps": 631, + "irc": 194, + "ircs": 6697, + "ldap": 389, + "ldaps": 636, + "mms": 1755, + "msrp": 2855, + "msrps": None, + "mtqp": 1038, + "nfs": 111, + "nntp": 119, + "nntps": 563, + "pop": 110, + "prospero": 1525, + "redis": 6379, + "rsync": 873, + "rtsp": 554, + "rtsps": 322, + "rtspu": 5005, + "sftp": 22, + "smb": 445, + "snmp": 161, + "ssh": 22, + "steam": None, + "svn": 3690, + "telnet": 23, + "ventrilo": 3784, + "vnc": 5900, + "wais": 210, + "ws": 80, + "wss": 443, + "xmpp": None, +} + +# This list of schemes that don't use authorities is also from the link above. +NO_NETLOC_SCHEMES = set( + [ + "urn", + "about", + "bitcoin", + "blob", + "data", + "geo", + "magnet", + "mailto", + "news", + "pkcs11", + "sip", + "sips", + "tel", + ] +) +# As of Mar 11, 2017, there were 44 netloc schemes, and 13 non-netloc + +NO_QUERY_PLUS_SCHEMES = set() + + +def register_scheme( + text, uses_netloc=True, default_port=None, query_plus_is_space=True +): + # type: (Text, bool, Optional[int], bool) -> None + """Registers new scheme information, resulting in correct port and + slash behavior from the URL object. There are dozens of standard + schemes preregistered, so this function is mostly meant for + proprietary internal customizations or stopgaps on missing + standards information. 
If a scheme seems to be missing, please + `file an issue`_! + + Args: + text: A string representation of the scheme. + (the 'http' in 'http://hatnote.com') + uses_netloc: Does the scheme support specifying a + network host? For instance, "http" does, "mailto" does + not. Defaults to True. + default_port: The default port, if any, for + netloc-using schemes. + query_plus_is_space: If true, a "+" in the query string should be + decoded as a space by DecodedURL. + + .. _file an issue: https://github.com/mahmoud/hyperlink/issues + """ + text = text.lower() + if default_port is not None: + try: + default_port = int(default_port) + except (ValueError, TypeError): + raise ValueError( + "default_port expected integer or None, not %r" + % (default_port,) + ) + + if uses_netloc is True: + SCHEME_PORT_MAP[text] = default_port + elif uses_netloc is False: + if default_port is not None: + raise ValueError( + "unexpected default port while specifying" + " non-netloc scheme: %r" % default_port + ) + NO_NETLOC_SCHEMES.add(text) + else: + raise ValueError("uses_netloc expected bool, not: %r" % uses_netloc) + + if not query_plus_is_space: + NO_QUERY_PLUS_SCHEMES.add(text) + + return + + +def scheme_uses_netloc(scheme, default=None): + # type: (Text, Optional[bool]) -> Optional[bool] + """Whether or not a URL uses :code:`:` or :code:`://` to separate the + scheme from the rest of the URL depends on the scheme's own + standard definition. There is no way to infer this behavior + from other parts of the URL. A scheme either supports network + locations or it does not. + + The URL type's approach to this is to check for explicitly + registered schemes, with common schemes like HTTP + preregistered. This is the same approach taken by + :mod:`urlparse`. + + URL adds two additional heuristics if the scheme as a whole is + not registered. First, it attempts to check the subpart of the + scheme after the last ``+`` character. This adds intuitive + behavior for schemes like ``git+ssh``. 
Second, if a URL with + an unrecognized scheme is loaded, it will maintain the + separator it sees. + """ + if not scheme: + return False + scheme = scheme.lower() + if scheme in SCHEME_PORT_MAP: + return True + if scheme in NO_NETLOC_SCHEMES: + return False + if scheme.split("+")[-1] in SCHEME_PORT_MAP: + return True + return default + + +class URLParseError(ValueError): + """Exception inheriting from :exc:`ValueError`, raised when failing to + parse a URL. Mostly raised on invalid ports and IPv6 addresses. + """ + + pass + + +def _optional(argument, default): + # type: (Any, Any) -> Any + if argument is _UNSET: + return default + else: + return argument + + +def _typecheck(name, value, *types): + # type: (Text, T, Type[Any]) -> T + """ + Check that the given *value* is one of the given *types*, or raise an + exception describing the problem using *name*. + """ + if not types: + raise ValueError("expected one or more types, maybe use _textcheck?") + if not isinstance(value, types): + raise TypeError( + "expected %s for %s, got %r" + % (" or ".join([t.__name__ for t in types]), name, value) + ) + return value + + +def _textcheck(name, value, delims=frozenset(), nullable=False): + # type: (Text, T, Iterable[Text], bool) -> T + if not isinstance(value, Text): + if nullable and value is None: + # used by query string values + return value # type: ignore[unreachable] + else: + str_name = "unicode" if PY2 else "str" + exp = str_name + " or NoneType" if nullable else str_name + raise TypeError("expected %s for %s, got %r" % (exp, name, value)) + if delims and set(value) & set(delims): # TODO: test caching into regexes + raise ValueError( + "one or more reserved delimiters %s present in %s: %r" + % ("".join(delims), name, value) + ) + return value # type: ignore[return-value] # T vs. Text + + +def iter_pairs(iterable): + # type: (Iterable[Any]) -> Iterator[Any] + """ + Iterate over the (key, value) pairs in ``iterable``. 
+ + This handles dictionaries sensibly, and falls back to assuming the + iterable yields (key, value) pairs. This behaviour is similar to + what Python's ``dict()`` constructor does. + """ + if isinstance(iterable, MappingABC): + iterable = iterable.items() + return iter(iterable) + + +def _decode_unreserved(text, normalize_case=False, encode_stray_percents=False): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_UNRESERVED_DECODE_MAP, + ) + + +def _decode_userinfo_part( + text, normalize_case=False, encode_stray_percents=False +): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_USERINFO_DECODE_MAP, + ) + + +def _decode_path_part(text, normalize_case=False, encode_stray_percents=False): + # type: (Text, bool, bool) -> Text + """ + >>> _decode_path_part(u'%61%77%2f%7a') + u'aw%2fz' + >>> _decode_path_part(u'%61%77%2f%7a', normalize_case=True) + u'aw%2Fz' + """ + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_PATH_DECODE_MAP, + ) + + +def _decode_query_key(text, normalize_case=False, encode_stray_percents=False): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_QUERY_KEY_DECODE_MAP, + ) + + +def _decode_query_value( + text, normalize_case=False, encode_stray_percents=False +): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_QUERY_VALUE_DECODE_MAP, + ) + + +def _decode_fragment_part( + text, normalize_case=False, encode_stray_percents=False +): + # type: (Text, bool, bool) -> Text + return _percent_decode( + text, + 
normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, + _decode_map=_FRAGMENT_DECODE_MAP, + ) + + +def _percent_decode( + text, # type: Text + normalize_case=False, # type: bool + subencoding="utf-8", # type: Text + raise_subencoding_exc=False, # type: bool + encode_stray_percents=False, # type: bool + _decode_map=_HEX_CHAR_MAP, # type: Mapping[bytes, bytes] +): + # type: (...) -> Text + """Convert percent-encoded text characters to their normal, + human-readable equivalents. + + All characters in the input text must be encodable by + *subencoding*. All special characters underlying the values in the + percent-encoding must be decodable as *subencoding*. If a + non-*subencoding*-valid string is passed, the original text is + returned with no changes applied. + + Only called by field-tailored variants, e.g., + :func:`_decode_path_part`, as every percent-encodable part of the + URL has characters which should not be percent decoded. + + >>> _percent_decode(u'abc%20def') + u'abc def' + + Args: + text: Text with percent-encoding present. + normalize_case: Whether undecoded percent segments, such as encoded + delimiters, should be uppercased, per RFC 3986 Section 2.1. + See :func:`_decode_path_part` for an example. + subencoding: The name of the encoding underlying the percent-encoding. + raise_subencoding_exc: Whether an error in decoding the bytes + underlying the percent-decoding should be raised. + + Returns: + Text: The percent-decoded version of *text*, decoded by *subencoding*. 
+ """ + try: + quoted_bytes = text.encode(subencoding) + except UnicodeEncodeError: + return text + + bits = quoted_bytes.split(b"%") + if len(bits) == 1: + return text + + res = [bits[0]] + append = res.append + + for item in bits[1:]: + hexpair, rest = item[:2], item[2:] + try: + append(_decode_map[hexpair]) + append(rest) + except KeyError: + pair_is_hex = hexpair in _HEX_CHAR_MAP + if pair_is_hex or not encode_stray_percents: + append(b"%") + else: + # if it's undecodable, treat as a real percent sign, + # which is reserved (because it wasn't in the + # context-aware _decode_map passed in), and should + # stay in an encoded state. + append(b"%25") + if normalize_case and pair_is_hex: + append(hexpair.upper()) + append(rest) + else: + append(item) + + unquoted_bytes = b"".join(res) + + try: + return unquoted_bytes.decode(subencoding) + except UnicodeDecodeError: + if raise_subencoding_exc: + raise + return text + + +def _decode_host(host): + # type: (Text) -> Text + """Decode a host from ASCII-encodable text to IDNA-decoded text. If + the host text is not ASCII, it is returned unchanged, as it is + presumed that it is already IDNA-decoded. + + Some technical details: _decode_host is built on top of the "idna" + package, which has some quirks: + + Capital letters are not valid IDNA2008. The idna package will + raise an exception like this on capital letters: + + > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed + + However, if a segment of a host (i.e., something in + url.host.split('.')) is already ASCII, idna doesn't perform its + usual checks. In fact, for capital letters it automatically + lowercases them. + + This check and some other functionality can be bypassed by passing + uts46=True to idna.encode/decode. This allows a more permissive and + convenient interface. So far it seems like the balanced approach. 
+ + Example output (from idna==2.6): + + >> idna.encode(u'mahmöud.io') + 'xn--mahmud-zxa.io' + >> idna.encode(u'Mahmöud.io') + Traceback (most recent call last): + File "", line 1, in + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode + result.append(alabel(label)) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel + check_label(label) + File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label + raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label))) + idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed + >> idna.encode(u'Mahmoud.io') + 'Mahmoud.io' + + # Similar behavior for decodes below + >> idna.decode(u'Mahmoud.io') + u'mahmoud.io + >> idna.decode(u'Méhmoud.io', uts46=True) + u'm\xe9hmoud.io' + """ # noqa: E501 + if not host: + return u"" + try: + host_bytes = host.encode("ascii") + except UnicodeEncodeError: + host_text = host + else: + try: + host_text = idna_decode(host_bytes, uts46=True) + except ValueError: + # only reached on "narrow" (UCS-2) Python builds <3.4, see #7 + # NOTE: not going to raise here, because there's no + # ambiguity in the IDNA, and the host is still + # technically usable + host_text = host + return host_text + + +def _resolve_dot_segments(path): + # type: (Sequence[Text]) -> Sequence[Text] + """Normalize the URL path by resolving segments of '.' and '..'. For + more details, see `RFC 3986 section 5.2.4, Remove Dot Segments`_. + + Args: + path: sequence of path segments in text form + + Returns: + A new sequence of path segments with the '.' and '..' elements removed + and resolved. + + .. 
_RFC 3986 section 5.2.4, Remove Dot Segments: https://tools.ietf.org/html/rfc3986#section-5.2.4 + """ # noqa: E501 + segs = [] # type: List[Text] + + for seg in path: + if seg == u".": + pass + elif seg == u"..": + if segs: + segs.pop() + else: + segs.append(seg) + + if list(path[-1:]) in ([u"."], [u".."]): + segs.append(u"") + + return segs + + +def parse_host(host): + # type: (Text) -> Tuple[Optional[AddressFamily], Text] + """Parse the host into a tuple of ``(family, host)``, where family + is the appropriate :mod:`socket` module constant when the host is + an IP address. Family is ``None`` when the host is not an IP. + + Will raise :class:`URLParseError` on invalid IPv6 constants. + + Returns: + family (socket constant or None), host (string) + + >>> import socket + >>> parse_host('googlewebsite.com') == (None, 'googlewebsite.com') + True + >>> parse_host('::1') == (socket.AF_INET6, '::1') + True + >>> parse_host('192.168.1.1') == (socket.AF_INET, '192.168.1.1') + True + """ + if not host: + return None, u"" + + if u":" in host: + try: + inet_pton(AF_INET6, host) + except socket.error as se: + raise URLParseError("invalid IPv6 host: %r (%r)" % (host, se)) + except UnicodeEncodeError: + pass # TODO: this can't be a real host right? + else: + family = AF_INET6 # type: Optional[AddressFamily] + else: + try: + inet_pton(AF_INET, host) + except (socket.error, UnicodeEncodeError): + family = None # not an IP + else: + family = AF_INET + + return family, host + + +class URL(object): + r"""From blogs to billboards, URLs are so common, that it's easy to + overlook their complexity and power. With hyperlink's + :class:`URL` type, working with URLs doesn't have to be hard. + + URLs are made of many parts. 
Most of these parts are officially + named in `RFC 3986`_ and this diagram may prove handy in identifying + them:: + + foo://user:pass@example.com:8042/over/there?name=ferret#nose + \_/ \_______/ \_________/ \__/\_________/ \_________/ \__/ + | | | | | | | + scheme userinfo host port path query fragment + + While :meth:`~URL.from_text` is used for parsing whole URLs, the + :class:`URL` constructor builds a URL from the individual + components, like so:: + + >>> from hyperlink import URL + >>> url = URL(scheme=u'https', host=u'example.com', path=[u'hello', u'world']) + >>> print(url.to_text()) + https://example.com/hello/world + + The constructor runs basic type checks. All strings are expected + to be text (:class:`str` in Python 3, :class:`unicode` in Python 2). All + arguments are optional, defaulting to appropriately empty values. A full + list of constructor arguments is below. + + Args: + scheme: The text name of the scheme. + host: The host portion of the network location + port: The port part of the network location. If ``None`` or no port is + passed, the port will default to the default port of the scheme, if + it is known. See the ``SCHEME_PORT_MAP`` and + :func:`register_default_port` for more info. + path: A tuple of strings representing the slash-separated parts of the + path, each percent-encoded. + query: The query parameters, as a dictionary or as an sequence of + percent-encoded key-value pairs. + fragment: The fragment part of the URL. + rooted: A rooted URL is one which indicates an absolute path. + This is True on any URL that includes a host, or any relative URL + that starts with a slash. + userinfo: The username or colon-separated username:password pair. + uses_netloc: Indicates whether ``://`` (the "netloc separator") will + appear to separate the scheme from the *path* in cases where no + host is present. 
+ Setting this to ``True`` is a non-spec-compliant affordance for the + common practice of having URIs that are *not* URLs (cannot have a + 'host' part) but nevertheless use the common ``://`` idiom that + most people associate with URLs; e.g. ``message:`` URIs like + ``message://message-id`` being equivalent to ``message:message-id``. + This may be inferred based on the scheme depending on whether + :func:`register_scheme` has been used to register the scheme and + should not be passed directly unless you know the scheme works like + this and you know it has not been registered. + + All of these parts are also exposed as read-only attributes of :class:`URL` + instances, along with several useful methods. + + .. _RFC 3986: https://tools.ietf.org/html/rfc3986 + .. _RFC 3987: https://tools.ietf.org/html/rfc3987 + """ # noqa: E501 + + def __init__( + self, + scheme=None, # type: Optional[Text] + host=None, # type: Optional[Text] + path=(), # type: Iterable[Text] + query=(), # type: QueryParameters + fragment=u"", # type: Text + port=None, # type: Optional[int] + rooted=None, # type: Optional[bool] + userinfo=u"", # type: Text + uses_netloc=None, # type: Optional[bool] + ): + # type: (...) -> None + if host is not None and scheme is None: + scheme = u"http" # TODO: why + if port is None and scheme is not None: + port = SCHEME_PORT_MAP.get(scheme) + if host and query and not path: + # per RFC 3986 6.2.3, "a URI that uses the generic syntax + # for authority with an empty path should be normalized to + # a path of '/'." + path = (u"",) + + # Now that we're done detecting whether they were passed, we can set + # them to their defaults: + if scheme is None: + scheme = u"" + if host is None: + host = u"" + if rooted is None: + rooted = bool(host) + + # Set attributes. + self._scheme = _textcheck("scheme", scheme) + if self._scheme: + if not _SCHEME_RE.match(self._scheme): + raise ValueError( + 'invalid scheme: %r. Only alphanumeric, "+",' + ' "-", and "." allowed. 
Did you mean to call'
+                    " %s.from_text()?" % (self._scheme, self.__class__.__name__)
+                )
+
+        _, self._host = parse_host(_textcheck("host", host, "/?#@"))
+        if isinstance(path, Text):
+            raise TypeError(
+                "expected iterable of text for path, not: %r" % (path,)
+            )
+        self._path = tuple(
+            (_textcheck("path segment", segment, "/?#") for segment in path)
+        )
+        self._query = tuple(
+            (
+                _textcheck("query parameter name", k, "&=#"),
+                _textcheck("query parameter value", v, "&#", nullable=True),
+            )
+            for k, v in iter_pairs(query)
+        )
+        self._fragment = _textcheck("fragment", fragment)
+        self._port = _typecheck("port", port, int, NoneType)
+        self._rooted = _typecheck("rooted", rooted, bool)
+        self._userinfo = _textcheck("userinfo", userinfo, "/?#@")
+
+        if uses_netloc is None:
+            uses_netloc = scheme_uses_netloc(self._scheme, uses_netloc)
+        self._uses_netloc = _typecheck(
+            "uses_netloc", uses_netloc, bool, NoneType
+        )
+        will_have_authority = self._host or (
+            self._port and self._port != SCHEME_PORT_MAP.get(scheme)
+        )
+        if will_have_authority:
+            # fixup for rooted consistency; if there's any 'authority'
+            # represented in the textual URL, then the path must be rooted, and
+            # we're definitely using a netloc (there must be a ://).
+            self._rooted = True
+            self._uses_netloc = True
+        if (not self._rooted) and self.path[:1] == (u"",):
+            self._rooted = True
+            self._path = self._path[1:]
+        if not will_have_authority and self._path and not self._rooted:
+            # If, after fixing up the path, there *is* a path and it *isn't*
+            # rooted, then we are definitely not using a netloc; if we did, it
+            # would make the path (erroneously) look like a hostname.
+ self._uses_netloc = False + + def get_decoded_url(self, lazy=False): + # type: (bool) -> DecodedURL + try: + return self._decoded_url + except AttributeError: + self._decoded_url = DecodedURL(self, lazy=lazy) # type: DecodedURL + return self._decoded_url + + @property + def scheme(self): + # type: () -> Text + """The scheme is a string, and the first part of an absolute URL, the + part before the first colon, and the part which defines the + semantics of the rest of the URL. Examples include "http", + "https", "ssh", "file", "mailto", and many others. See + :func:`~hyperlink.register_scheme()` for more info. + """ + return self._scheme + + @property + def host(self): + # type: () -> Text + """The host is a string, and the second standard part of an absolute + URL. When present, a valid host must be a domain name, or an + IP (v4 or v6). It occurs before the first slash, or the second + colon, if a :attr:`~hyperlink.URL.port` is provided. + """ + return self._host + + @property + def port(self): + # type: () -> Optional[int] + """The port is an integer that is commonly used in connecting to the + :attr:`host`, and almost never appears without it. + + When not present in the original URL, this attribute defaults + to the scheme's default port. If the scheme's default port is + not known, and the port is not provided, this attribute will + be set to None. + + >>> URL.from_text(u'http://example.com/pa/th').port + 80 + >>> URL.from_text(u'foo://example.com/pa/th').port + >>> URL.from_text(u'foo://example.com:8042/pa/th').port + 8042 + + .. note:: + + Per the standard, when the port is the same as the schemes + default port, it will be omitted in the text URL. + """ + return self._port + + @property + def path(self): + # type: () -> Sequence[Text] + """A tuple of strings, created by splitting the slash-separated + hierarchical path. Started by the first slash after the host, + terminated by a "?", which indicates the start of the + :attr:`~hyperlink.URL.query` string. 
+ """ + return self._path + + @property + def query(self): + # type: () -> QueryPairs + """Tuple of pairs, created by splitting the ampersand-separated + mapping of keys and optional values representing + non-hierarchical data used to identify the resource. Keys are + always strings. Values are strings when present, or None when + missing. + + For more operations on the mapping, see + :meth:`~hyperlink.URL.get()`, :meth:`~hyperlink.URL.add()`, + :meth:`~hyperlink.URL.set()`, and + :meth:`~hyperlink.URL.delete()`. + """ + return self._query + + @property + def fragment(self): + # type: () -> Text + """A string, the last part of the URL, indicated by the first "#" + after the :attr:`~hyperlink.URL.path` or + :attr:`~hyperlink.URL.query`. Enables indirect identification + of a secondary resource, like an anchor within an HTML page. + """ + return self._fragment + + @property + def rooted(self): + # type: () -> bool + """Whether or not the path starts with a forward slash (``/``). + + This is taken from the terminology in the BNF grammar, + specifically the "path-rootless", rule, since "absolute path" + and "absolute URI" are somewhat ambiguous. :attr:`path` does + not contain the implicit prefixed ``"/"`` since that is + somewhat awkward to work with. + """ + return self._rooted + + @property + def userinfo(self): + # type: () -> Text + """The colon-separated string forming the username-password + combination. + """ + return self._userinfo + + @property + def uses_netloc(self): + # type: () -> Optional[bool] + """ + Indicates whether ``://`` (the "netloc separator") will appear to + separate the scheme from the *path* in cases where no host is present. + """ + return self._uses_netloc + + @property + def user(self): + # type: () -> Text + """ + The user portion of :attr:`~hyperlink.URL.userinfo`. 
+ """ + return self.userinfo.split(u":")[0] + + def authority(self, with_password=False, **kw): + # type: (bool, Any) -> Text + """Compute and return the appropriate host/port/userinfo combination. + + >>> url = URL.from_text(u'http://user:pass@localhost:8080/a/b?x=y') + >>> url.authority() + u'user:@localhost:8080' + >>> url.authority(with_password=True) + u'user:pass@localhost:8080' + + Args: + with_password: Whether the return value of this method include the + password in the URL, if it is set. + Defaults to False. + + Returns: + Text: The authority (network location and user information) portion + of the URL. + """ + # first, a bit of twisted compat + with_password = kw.pop("includeSecrets", with_password) + if kw: + raise TypeError("got unexpected keyword arguments: %r" % kw.keys()) + host = self.host + if ":" in host: + hostport = ["[" + host + "]"] + else: + hostport = [self.host] + if self.port != SCHEME_PORT_MAP.get(self.scheme): + hostport.append(Text(self.port)) + authority = [] + if self.userinfo: + userinfo = self.userinfo + if not with_password and u":" in userinfo: + userinfo = userinfo[: userinfo.index(u":") + 1] + authority.append(userinfo) + authority.append(u":".join(hostport)) + return u"@".join(authority) + + def __eq__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + for attr in [ + "scheme", + "userinfo", + "host", + "query", + "fragment", + "port", + "uses_netloc", + "rooted", + ]: + if getattr(self, attr) != getattr(other, attr): + return False + if self.path == other.path or ( + self.path in _ROOT_PATHS and other.path in _ROOT_PATHS + ): + return True + return False + + def __ne__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + return not self.__eq__(other) + + def __hash__(self): + # type: () -> int + return hash( + ( + self.__class__, + self.scheme, + self.userinfo, + self.host, + self.path, + self.query, + 
self.fragment, + self.port, + self.rooted, + self.uses_netloc, + ) + ) + + @property + def absolute(self): + # type: () -> bool + """Whether or not the URL is "absolute". Absolute URLs are complete + enough to resolve to a network resource without being relative + to a base URI. + + >>> URL.from_text(u'http://wikipedia.org/').absolute + True + >>> URL.from_text(u'?a=b&c=d').absolute + False + + Absolute URLs must have both a scheme and a host set. + """ + return bool(self.scheme and self.host) + + def replace( + self, + scheme=_UNSET, # type: Optional[Text] + host=_UNSET, # type: Optional[Text] + path=_UNSET, # type: Iterable[Text] + query=_UNSET, # type: QueryParameters + fragment=_UNSET, # type: Text + port=_UNSET, # type: Optional[int] + rooted=_UNSET, # type: Optional[bool] + userinfo=_UNSET, # type: Text + uses_netloc=_UNSET, # type: Optional[bool] + ): + # type: (...) -> URL + """:class:`URL` objects are immutable, which means that attributes + are designed to be set only once, at construction. Instead of + modifying an existing URL, one simply creates a copy with the + desired changes. + + If any of the following arguments is omitted, it defaults to + the value on the current URL. + + Args: + scheme: The text name of the scheme. + host: The host portion of the network location. + path: A tuple of strings representing the slash-separated parts of + the path. + query: The query parameters, as a dictionary or as an sequence of + key-value pairs. + fragment: The fragment part of the URL. + port: The port part of the network location. + rooted: Whether or not the path begins with a slash. + userinfo: The username or colon-separated username:password pair. + uses_netloc: Indicates whether ``://`` (the "netloc separator") + will appear to separate the scheme from the *path* in cases + where no host is present. 
+ Setting this to ``True`` is a non-spec-compliant affordance for + the common practice of having URIs that are *not* URLs (cannot + have a 'host' part) but nevertheless use the common ``://`` + idiom that most people associate with URLs; e.g. ``message:`` + URIs like ``message://message-id`` being equivalent to + ``message:message-id``. + This may be inferred based on the scheme depending on whether + :func:`register_scheme` has been used to register the scheme + and should not be passed directly unless you know the scheme + works like this and you know it has not been registered. + + Returns: + URL: A copy of the current :class:`URL`, with new values for + parameters passed. + """ + if scheme is not _UNSET and scheme != self.scheme: + # when changing schemes, reset the explicit uses_netloc preference + # to honor the new scheme. + uses_netloc = None + return self.__class__( + scheme=_optional(scheme, self.scheme), + host=_optional(host, self.host), + path=_optional(path, self.path), + query=_optional(query, self.query), + fragment=_optional(fragment, self.fragment), + port=_optional(port, self.port), + rooted=_optional(rooted, self.rooted), + userinfo=_optional(userinfo, self.userinfo), + uses_netloc=_optional(uses_netloc, self.uses_netloc), + ) + + @classmethod + def from_text(cls, text): + # type: (Text) -> URL + """Whereas the :class:`URL` constructor is useful for constructing + URLs from parts, :meth:`~URL.from_text` supports parsing whole + URLs from their string form:: + + >>> URL.from_text(u'http://example.com') + URL.from_text(u'http://example.com') + >>> URL.from_text(u'?a=b&x=y') + URL.from_text(u'?a=b&x=y') + + As you can see above, it's also used as the :func:`repr` of + :class:`URL` objects. The natural counterpart to + :func:`~URL.to_text()`. This method only accepts *text*, so be + sure to decode those bytestrings. + + Args: + text: A valid URL string. + + Returns: + URL: The structured object version of the parsed string. + + .. 
note:: + + Somewhat unexpectedly, URLs are a far more permissive + format than most would assume. Many strings which don't + look like URLs are still valid URLs. As a result, this + method only raises :class:`URLParseError` on invalid port + and IPv6 values in the host portion of the URL. + """ + um = _URL_RE.match(_textcheck("text", text)) + if um is None: + raise URLParseError("could not parse url: %r" % text) + gs = um.groupdict() + + au_text = gs["authority"] or u"" + au_m = _AUTHORITY_RE.match(au_text) + if au_m is None: + raise URLParseError( + "invalid authority %r in url: %r" % (au_text, text) + ) + au_gs = au_m.groupdict() + if au_gs["bad_host"]: + raise URLParseError( + "invalid host %r in url: %r" % (au_gs["bad_host"], text) + ) + + userinfo = au_gs["userinfo"] or u"" + + host = au_gs["ipv6_host"] or au_gs["plain_host"] + port = au_gs["port"] + if port is not None: + try: + port = int(port) # type: ignore[assignment] # FIXME, see below + except ValueError: + if not port: # TODO: excessive? 
+ raise URLParseError("port must not be empty: %r" % au_text) + raise URLParseError("expected integer for port, not %r" % port) + + scheme = gs["scheme"] or u"" + fragment = gs["fragment"] or u"" + uses_netloc = bool(gs["_netloc_sep"]) + + if gs["path"]: + path = tuple(gs["path"].split(u"/")) + if not path[0]: + path = path[1:] + rooted = True + else: + rooted = False + else: + path = () + rooted = bool(au_text) + if gs["query"]: + query = tuple( + ( + qe.split(u"=", 1) # type: ignore[misc] + if u"=" in qe + else (qe, None) + ) + for qe in gs["query"].split(u"&") + ) # type: QueryPairs + else: + query = () + return cls( + scheme, + host, + path, + query, + fragment, + port, # type: ignore[arg-type] # FIXME, see above + rooted, + userinfo, + uses_netloc, + ) + + def normalize( + self, + scheme=True, + host=True, + path=True, + query=True, + fragment=True, + userinfo=True, + percents=True, + ): + # type: (bool, bool, bool, bool, bool, bool, bool) -> URL + """Return a new URL object with several standard normalizations + applied: + + * Decode unreserved characters (`RFC 3986 2.3`_) + * Uppercase remaining percent-encoded octets (`RFC 3986 2.1`_) + * Convert scheme and host casing to lowercase (`RFC 3986 3.2.2`_) + * Resolve any "." and ".." references in the path (`RFC 3986 6.2.2.3`_) + * Ensure an ending slash on URLs with an empty path (`RFC 3986 6.2.3`_) + * Encode any stray percent signs (`%`) in percent-encoded + fields (path, query, fragment, userinfo) (`RFC 3986 2.4`_) + + All are applied by default, but normalizations can be disabled + per-part by passing `False` for that part's corresponding + name. 
+ + Args: + scheme: Convert the scheme to lowercase + host: Convert the host to lowercase + path: Normalize the path (see above for details) + query: Normalize the query string + fragment: Normalize the fragment + userinfo: Normalize the userinfo + percents: Encode isolated percent signs for any percent-encoded + fields which are being normalized (defaults to `True`). + + >>> url = URL.from_text(u'Http://example.COM/a/../b/./c%2f?%61%') + >>> print(url.normalize().to_text()) + http://example.com/b/c%2F?a%25 + + .. _RFC 3986 3.2.2: https://tools.ietf.org/html/rfc3986#section-3.2.2 + .. _RFC 3986 2.3: https://tools.ietf.org/html/rfc3986#section-2.3 + .. _RFC 3986 2.1: https://tools.ietf.org/html/rfc3986#section-2.1 + .. _RFC 3986 6.2.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.2.3 + .. _RFC 3986 6.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.3 + .. _RFC 3986 2.4: https://tools.ietf.org/html/rfc3986#section-2.4 + """ # noqa: E501 + kw = {} # type: Dict[str, Any] + if scheme: + kw["scheme"] = self.scheme.lower() + if host: + kw["host"] = self.host.lower() + + def _dec_unres(target): + # type: (Text) -> Text + return _decode_unreserved( + target, normalize_case=True, encode_stray_percents=percents + ) + + if path: + if self.path: + kw["path"] = [ + _dec_unres(p) for p in _resolve_dot_segments(self.path) + ] + else: + kw["path"] = (u"",) + if query: + kw["query"] = [ + (_dec_unres(k), _dec_unres(v) if v else v) + for k, v in self.query + ] + if fragment: + kw["fragment"] = _dec_unres(self.fragment) + if userinfo: + kw["userinfo"] = u":".join( + [_dec_unres(p) for p in self.userinfo.split(":", 1)] + ) + + return self.replace(**kw) + + def child(self, *segments): + # type: (Text) -> URL + """Make a new :class:`URL` where the given path segments are a child + of this URL, preserving other parts of the URL, including the + query string and fragment. 
+ + For example:: + + >>> url = URL.from_text(u'http://localhost/a/b?x=y') + >>> child_url = url.child(u"c", u"d") + >>> child_url.to_text() + u'http://localhost/a/b/c/d?x=y' + + Args: + segments: Additional parts to be joined and added to the path, like + :func:`os.path.join`. Special characters in segments will be + percent encoded. + + Returns: + URL: A copy of the current URL with the extra path segments. + """ + if not segments: + return self + + segments = [ # type: ignore[assignment] # variable is tuple + _textcheck("path segment", s) for s in segments + ] + new_path = tuple(self.path) + if self.path and self.path[-1] == u"": + new_path = new_path[:-1] + new_path += tuple(_encode_path_parts(segments, maximal=False)) + return self.replace(path=new_path) + + def sibling(self, segment): + # type: (Text) -> URL + """Make a new :class:`URL` with a single path segment that is a + sibling of this URL path. + + Args: + segment: A single path segment. + + Returns: + URL: A copy of the current URL with the last path segment + replaced by *segment*. Special characters such as + ``/?#`` will be percent encoded. + """ + _textcheck("path segment", segment) + new_path = tuple(self.path)[:-1] + (_encode_path_part(segment),) + return self.replace(path=new_path) + + def click(self, href=u""): + # type: (Union[Text, URL]) -> URL + """Resolve the given URL relative to this URL. + + The resulting URI should match what a web browser would + generate if you visited the current URL and clicked on *href*. + + >>> url = URL.from_text(u'http://blog.hatnote.com/') + >>> url.click(u'/post/155074058790').to_text() + u'http://blog.hatnote.com/post/155074058790' + >>> url = URL.from_text(u'http://localhost/a/b/c/') + >>> url.click(u'../d/./e').to_text() + u'http://localhost/a/b/d/e' + + Args (Text): + href: A string representing a clicked URL. + + Return: + A copy of the current URL with navigation logic applied. + + For more information, see `RFC 3986 section 5`_. + + .. 
_RFC 3986 section 5: https://tools.ietf.org/html/rfc3986#section-5 + """ + if href: + if isinstance(href, URL): + clicked = href + else: + # TODO: This error message is not completely accurate, + # as URL objects are now also valid, but Twisted's + # test suite (wrongly) relies on this exact message. + _textcheck("relative URL", href) + clicked = URL.from_text(href) + if clicked.absolute: + return clicked + else: + clicked = self + + query = clicked.query + if clicked.scheme and not clicked.rooted: + # Schemes with relative paths are not well-defined. RFC 3986 calls + # them a "loophole in prior specifications" that should be avoided, + # or supported only for backwards compatibility. + raise NotImplementedError( + "absolute URI with rootless path: %r" % (href,) + ) + else: + if clicked.rooted: + path = clicked.path + elif clicked.path: + path = tuple(self.path)[:-1] + tuple(clicked.path) + else: + path = self.path + if not query: + query = self.query + return self.replace( + scheme=clicked.scheme or self.scheme, + host=clicked.host or self.host, + port=clicked.port or self.port, + path=_resolve_dot_segments(path), + query=query, + fragment=clicked.fragment, + ) + + def to_uri(self): + # type: () -> URL + u"""Make a new :class:`URL` instance with all non-ASCII characters + appropriately percent-encoded. This is useful to do in preparation + for sending a :class:`URL` over a network protocol. + + For example:: + + >>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri() + URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/') + + Returns: + URL: A new instance with its path segments, query parameters, and + hostname encoded, so that they are all in the standard + US-ASCII range. 
+ """ + new_userinfo = u":".join( + [_encode_userinfo_part(p) for p in self.userinfo.split(":", 1)] + ) + new_path = _encode_path_parts( + self.path, has_scheme=bool(self.scheme), rooted=False, maximal=True + ) + new_host = ( + self.host + if not self.host + else idna_encode(self.host, uts46=True).decode("ascii") + ) + return self.replace( + userinfo=new_userinfo, + host=new_host, + path=new_path, + query=tuple( + [ + ( + _encode_query_key(k, maximal=True), + _encode_query_value(v, maximal=True) + if v is not None + else None, + ) + for k, v in self.query + ] + ), + fragment=_encode_fragment_part(self.fragment, maximal=True), + ) + + def to_iri(self): + # type: () -> URL + u"""Make a new :class:`URL` instance with all but a few reserved + characters decoded into human-readable format. + + Percent-encoded Unicode and IDNA-encoded hostnames are + decoded, like so:: + + >>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/') + >>> print(url.to_iri().to_text()) + https://ايران.example.com/foo⇧bar/ + + .. note:: + + As a general Python issue, "narrow" (UCS-2) builds of + Python may not be able to fully decode certain URLs, and + the in those cases, this method will return a best-effort, + partially-decoded, URL which is still valid. This issue + does not affect any Python builds 3.4+. + + Returns: + URL: A new instance with its path segments, query parameters, and + hostname decoded for display purposes. 
+ """ # noqa: E501 + new_userinfo = u":".join( + [_decode_userinfo_part(p) for p in self.userinfo.split(":", 1)] + ) + host_text = _decode_host(self.host) + + return self.replace( + userinfo=new_userinfo, + host=host_text, + path=[_decode_path_part(segment) for segment in self.path], + query=tuple( + ( + _decode_query_key(k), + _decode_query_value(v) if v is not None else None, + ) + for k, v in self.query + ), + fragment=_decode_fragment_part(self.fragment), + ) + + def to_text(self, with_password=False): + # type: (bool) -> Text + """Render this URL to its textual representation. + + By default, the URL text will *not* include a password, if one + is set. RFC 3986 considers using URLs to represent such + sensitive information as deprecated. Quoting from RFC 3986, + `section 3.2.1`: + + "Applications should not render as clear text any data after the + first colon (":") character found within a userinfo subcomponent + unless the data after the colon is the empty string (indicating no + password)." + + Args (bool): + with_password: Whether or not to include the password in the URL + text. Defaults to False. + + Returns: + Text: The serialized textual representation of this URL, such as + ``u"http://example.com/some/path?some=query"``. + + The natural counterpart to :class:`URL.from_text()`. + + .. 
_section 3.2.1: https://tools.ietf.org/html/rfc3986#section-3.2.1 + """ + scheme = self.scheme + authority = self.authority(with_password) + path = "/".join( + _encode_path_parts( + self.path, + rooted=self.rooted, + has_scheme=bool(scheme), + has_authority=bool(authority), + maximal=False, + ) + ) + query_parts = [] + for k, v in self.query: + if v is None: + query_parts.append(_encode_query_key(k, maximal=False)) + else: + query_parts.append( + u"=".join( + ( + _encode_query_key(k, maximal=False), + _encode_query_value(v, maximal=False), + ) + ) + ) + query_string = u"&".join(query_parts) + + fragment = self.fragment + + parts = [] # type: List[Text] + _add = parts.append + if scheme: + _add(scheme) + _add(":") + if authority: + _add("//") + _add(authority) + elif scheme and path[:2] != "//" and self.uses_netloc: + _add("//") + if path: + if scheme and authority and path[:1] != "/": + _add("/") # relpaths with abs authorities auto get '/' + _add(path) + if query_string: + _add("?") + _add(query_string) + if fragment: + _add("#") + _add(fragment) + return u"".join(parts) + + def __repr__(self): + # type: () -> str + """Convert this URL to an representation that shows all of its + constituent parts, as well as being a valid argument to + :func:`eval`. + """ + return "%s.from_text(%r)" % (self.__class__.__name__, self.to_text()) + + def _to_bytes(self): + # type: () -> bytes + """ + Allows for direct usage of URL objects with libraries like + requests, which automatically stringify URL parameters. See + issue #49. 
+ """ + return self.to_uri().to_text().encode("ascii") + + if PY2: + __str__ = _to_bytes + __unicode__ = to_text + else: + __bytes__ = _to_bytes + __str__ = to_text + + # # Begin Twisted Compat Code + asURI = to_uri + asIRI = to_iri + + @classmethod + def fromText(cls, s): + # type: (Text) -> URL + return cls.from_text(s) + + def asText(self, includeSecrets=False): + # type: (bool) -> Text + return self.to_text(with_password=includeSecrets) + + def __dir__(self): + # type: () -> Sequence[Text] + try: + ret = object.__dir__(self) + except AttributeError: + # object.__dir__ == AttributeError # pdw for py2 + ret = dir(self.__class__) + list(self.__dict__.keys()) + ret = sorted(set(ret) - set(["fromText", "asURI", "asIRI", "asText"])) + return ret + + # # End Twisted Compat Code + + def add(self, name, value=None): + # type: (Text, Optional[Text]) -> URL + """Make a new :class:`URL` instance with a given query argument, + *name*, added to it with the value *value*, like so:: + + >>> URL.from_text(u'https://example.com/?x=y').add(u'x') + URL.from_text(u'https://example.com/?x=y&x') + >>> URL.from_text(u'https://example.com/?x=y').add(u'x', u'z') + URL.from_text(u'https://example.com/?x=y&x=z') + + Args: + name: The name of the query parameter to add. + The part before the ``=``. + value: The value of the query parameter to add. + The part after the ``=``. + Defaults to ``None``, meaning no value. + + Returns: + URL: A new :class:`URL` instance with the parameter added. + """ + return self.replace(query=self.query + ((name, value),)) + + def set(self, name, value=None): + # type: (Text, Optional[Text]) -> URL + """Make a new :class:`URL` instance with the query parameter *name* + set to *value*. All existing occurences, if any are replaced + by the single name-value pair. 
+ + >>> URL.from_text(u'https://example.com/?x=y').set(u'x') + URL.from_text(u'https://example.com/?x') + >>> URL.from_text(u'https://example.com/?x=y').set(u'x', u'z') + URL.from_text(u'https://example.com/?x=z') + + Args: + name: The name of the query parameter to set. + The part before the ``=``. + value: The value of the query parameter to set. + The part after the ``=``. + Defaults to ``None``, meaning no value. + + Returns: + URL: A new :class:`URL` instance with the parameter set. + """ + # Preserve the original position of the query key in the list + q = [(k, v) for (k, v) in self.query if k != name] + idx = next( + (i for (i, (k, v)) in enumerate(self.query) if k == name), -1 + ) + q[idx:idx] = [(name, value)] + return self.replace(query=q) + + def get(self, name): + # type: (Text) -> List[Optional[Text]] + """Get a list of values for the given query parameter, *name*:: + + >>> url = URL.from_text(u'?x=1&x=2') + >>> url.get('x') + [u'1', u'2'] + >>> url.get('y') + [] + + If the given *name* is not set, an empty list is returned. A + list is always returned, and this method raises no exceptions. + + Args: + name: The name of the query parameter to get. + + Returns: + List[Optional[Text]]: A list of all the values associated with the + key, in string form. + """ + return [value for (key, value) in self.query if name == key] + + def remove( + self, + name, # type: Text + value=_UNSET, # type: Text + limit=None, # type: Optional[int] + ): + # type: (...) -> URL + """Make a new :class:`URL` instance with occurrences of the query + parameter *name* removed, or, if *value* is set, parameters + matching *name* and *value*. No exception is raised if the + parameter is not already set. + + Args: + name: The name of the query parameter to remove. + value: Optional value to additionally filter on. + Setting this removes query parameters which match both name + and value. + limit: Optional maximum number of parameters to remove. 
+ + Returns: + URL: A new :class:`URL` instance with the parameter removed. + """ + if limit is None: + if value is _UNSET: + nq = [(k, v) for (k, v) in self.query if k != name] + else: + nq = [ + (k, v) + for (k, v) in self.query + if not (k == name and v == value) + ] + else: + nq, removed_count = [], 0 + + for k, v in self.query: + if ( + k == name + and (value is _UNSET or v == value) + and removed_count < limit + ): + removed_count += 1 # drop it + else: + nq.append((k, v)) # keep it + + return self.replace(query=nq) + + +EncodedURL = URL # An alias better describing what the URL really is + +_EMPTY_URL = URL() + + +def _replace_plus(text): + # type: (Text) -> Text + return text.replace("+", "%20") + + +def _no_op(text): + # type: (Text) -> Text + return text + + +class DecodedURL(object): + """ + :class:`DecodedURL` is a type designed to act as a higher-level + interface to :class:`URL` and the recommended type for most + operations. By analogy, :class:`DecodedURL` is the + :class:`unicode` to URL's :class:`bytes`. + + :class:`DecodedURL` automatically handles encoding and decoding + all its components, such that all inputs and outputs are in a + maximally-decoded state. Note that this means, for some special + cases, a URL may not "roundtrip" character-for-character, but this + is considered a good tradeoff for the safety of automatic + encoding. + + Otherwise, :class:`DecodedURL` has almost exactly the same API as + :class:`URL`. + + Where applicable, a UTF-8 encoding is presumed. Be advised that + some interactions can raise :exc:`UnicodeEncodeErrors` and + :exc:`UnicodeDecodeErrors`, just like when working with + bytestrings. Examples of such interactions include handling query + strings encoding binary data, and paths containing segments with + special characters encoded with codecs other than UTF-8. + + Args: + url: A :class:`URL` object to wrap. + lazy: Set to True to avoid pre-decode all parts of the URL to check for + validity. + Defaults to False. 
+        query_plus_is_space:
+            characters in the query string should be treated
+            as spaces when decoding. If unspecified, the default is taken from
+            the scheme.
+
+    .. note::
+
+        The :class:`DecodedURL` initializer takes a :class:`URL` object,
+        not URL components, like :class:`URL`. To programmatically
+        construct a :class:`DecodedURL`, you can use this pattern:
+
+            >>> print(DecodedURL().replace(scheme=u'https',
+            ... host=u'pypi.org', path=(u'projects', u'hyperlink')).to_text())
+            https://pypi.org/projects/hyperlink
+
+    .. versionadded:: 18.0.0
+    """
+
+    def __init__(self, url=_EMPTY_URL, lazy=False, query_plus_is_space=None):
+        # type: (URL, bool, Optional[bool]) -> None
+        self._url = url
+        if query_plus_is_space is None:
+            query_plus_is_space = url.scheme not in NO_QUERY_PLUS_SCHEMES
+        self._query_plus_is_space = query_plus_is_space
+        if not lazy:
+            # cache the following, while triggering any decoding
+            # issues with decodable fields
+            self.host, self.userinfo, self.path, self.query, self.fragment
+        return
+
+    @classmethod
+    def from_text(cls, text, lazy=False, query_plus_is_space=None):
+        # type: (Text, bool, Optional[bool]) -> DecodedURL
+        """\
+        Make a `DecodedURL` instance from any text string containing a URL.
+
+        Args:
+            text: Text containing the URL
+            lazy: Set to True to avoid pre-decoding all parts of the URL
+                to check for validity.
+                Defaults to False.
+        """
+        _url = URL.from_text(text)
+        return cls(_url, lazy=lazy, query_plus_is_space=query_plus_is_space)
+
+    @property
+    def encoded_url(self):
+        # type: () -> URL
+        """Access the underlying :class:`URL` object, which has any special
+        characters encoded.
+ """ + return self._url + + def to_text(self, with_password=False): + # type: (bool) -> Text + "Passthrough to :meth:`~hyperlink.URL.to_text()`" + return self._url.to_text(with_password) + + def to_uri(self): + # type: () -> URL + "Passthrough to :meth:`~hyperlink.URL.to_uri()`" + return self._url.to_uri() + + def to_iri(self): + # type: () -> URL + "Passthrough to :meth:`~hyperlink.URL.to_iri()`" + return self._url.to_iri() + + def _clone(self, url): + # type: (URL) -> DecodedURL + return self.__class__( + url, + # TODO: propagate laziness? + query_plus_is_space=self._query_plus_is_space, + ) + + def click(self, href=u""): + # type: (Union[Text, URL, DecodedURL]) -> DecodedURL + """Return a new DecodedURL wrapping the result of + :meth:`~hyperlink.URL.click()` + """ + if isinstance(href, DecodedURL): + href = href._url + return self._clone( + self._url.click(href=href), + ) + + def sibling(self, segment): + # type: (Text) -> DecodedURL + """Automatically encode any reserved characters in *segment* and + return a new `DecodedURL` wrapping the result of + :meth:`~hyperlink.URL.sibling()` + """ + return self._clone( + self._url.sibling(_encode_reserved(segment)), + ) + + def child(self, *segments): + # type: (Text) -> DecodedURL + """Automatically encode any reserved characters in *segments* and + return a new `DecodedURL` wrapping the result of + :meth:`~hyperlink.URL.child()`. 
+ """ + if not segments: + return self + new_segs = [_encode_reserved(s) for s in segments] + return self._clone(self._url.child(*new_segs)) + + def normalize( + self, + scheme=True, + host=True, + path=True, + query=True, + fragment=True, + userinfo=True, + percents=True, + ): + # type: (bool, bool, bool, bool, bool, bool, bool) -> DecodedURL + """Return a new `DecodedURL` wrapping the result of + :meth:`~hyperlink.URL.normalize()` + """ + return self._clone( + self._url.normalize( + scheme, host, path, query, fragment, userinfo, percents + ) + ) + + @property + def absolute(self): + # type: () -> bool + return self._url.absolute + + @property + def scheme(self): + # type: () -> Text + return self._url.scheme + + @property + def host(self): + # type: () -> Text + return _decode_host(self._url.host) + + @property + def port(self): + # type: () -> Optional[int] + return self._url.port + + @property + def rooted(self): + # type: () -> bool + return self._url.rooted + + @property + def path(self): + # type: () -> Sequence[Text] + if not hasattr(self, "_path"): + self._path = tuple( + [ + _percent_decode(p, raise_subencoding_exc=True) + for p in self._url.path + ] + ) + return self._path + + @property + def query(self): + # type: () -> QueryPairs + if not hasattr(self, "_query"): + if self._query_plus_is_space: + predecode = _replace_plus + else: + predecode = _no_op + + self._query = cast( + QueryPairs, + tuple( + tuple( + _percent_decode( + predecode(x), raise_subencoding_exc=True + ) + if x is not None + else None + for x in (k, v) + ) + for k, v in self._url.query + ), + ) + return self._query + + @property + def fragment(self): + # type: () -> Text + if not hasattr(self, "_fragment"): + frag = self._url.fragment + self._fragment = _percent_decode(frag, raise_subencoding_exc=True) + return self._fragment + + @property + def userinfo(self): + # type: () -> Union[Tuple[str], Tuple[str, str]] + if not hasattr(self, "_userinfo"): + self._userinfo = cast( + 
Union[Tuple[str], Tuple[str, str]], + tuple( + tuple( + _percent_decode(p, raise_subencoding_exc=True) + for p in self._url.userinfo.split(":", 1) + ) + ), + ) + return self._userinfo + + @property + def user(self): + # type: () -> Text + return self.userinfo[0] + + @property + def uses_netloc(self): + # type: () -> Optional[bool] + return self._url.uses_netloc + + def replace( + self, + scheme=_UNSET, # type: Optional[Text] + host=_UNSET, # type: Optional[Text] + path=_UNSET, # type: Iterable[Text] + query=_UNSET, # type: QueryParameters + fragment=_UNSET, # type: Text + port=_UNSET, # type: Optional[int] + rooted=_UNSET, # type: Optional[bool] + userinfo=_UNSET, # type: Union[Tuple[str], Tuple[str, str]] + uses_netloc=_UNSET, # type: Optional[bool] + ): + # type: (...) -> DecodedURL + """While the signature is the same, this `replace()` differs a little + from URL.replace. For instance, it accepts userinfo as a + tuple, not as a string, handling the case of having a username + containing a `:`. As with the rest of the methods on + DecodedURL, if you pass a reserved character, it will be + automatically encoded instead of an error being raised. 
+ """ + if path is not _UNSET: + path = tuple(_encode_reserved(p) for p in path) + if query is not _UNSET: + query = cast( + QueryPairs, + tuple( + tuple( + _encode_reserved(x) if x is not None else None + for x in (k, v) + ) + for k, v in iter_pairs(query) + ), + ) + if userinfo is not _UNSET: + if len(userinfo) > 2: + raise ValueError( + 'userinfo expected sequence of ["user"] or' + ' ["user", "password"], got %r' % (userinfo,) + ) + userinfo_text = u":".join([_encode_reserved(p) for p in userinfo]) + else: + userinfo_text = _UNSET + new_url = self._url.replace( + scheme=scheme, + host=host, + path=path, + query=query, + fragment=fragment, + port=port, + rooted=rooted, + userinfo=userinfo_text, + uses_netloc=uses_netloc, + ) + return self._clone(url=new_url) + + def get(self, name): + # type: (Text) -> List[Optional[Text]] + "Get the value of all query parameters whose name matches *name*" + return [v for (k, v) in self.query if name == k] + + def add(self, name, value=None): + # type: (Text, Optional[Text]) -> DecodedURL + """Return a new DecodedURL with the query parameter *name* and *value* + added.""" + return self.replace(query=self.query + ((name, value),)) + + def set(self, name, value=None): + # type: (Text, Optional[Text]) -> DecodedURL + "Return a new DecodedURL with query parameter *name* set to *value*" + query = self.query + q = [(k, v) for (k, v) in query if k != name] + idx = next((i for (i, (k, v)) in enumerate(query) if k == name), -1) + q[idx:idx] = [(name, value)] + return self.replace(query=q) + + def remove( + self, + name, # type: Text + value=_UNSET, # type: Text + limit=None, # type: Optional[int] + ): + # type: (...) -> DecodedURL + """Return a new DecodedURL with query parameter *name* removed. + + Optionally also filter for *value*, as well as cap the number + of parameters removed with *limit*. 
+ """ + if limit is None: + if value is _UNSET: + nq = [(k, v) for (k, v) in self.query if k != name] + else: + nq = [ + (k, v) + for (k, v) in self.query + if not (k == name and v == value) + ] + else: + nq, removed_count = [], 0 + for k, v in self.query: + if ( + k == name + and (value is _UNSET or v == value) + and removed_count < limit + ): + removed_count += 1 # drop it + else: + nq.append((k, v)) # keep it + + return self.replace(query=nq) + + def __repr__(self): + # type: () -> str + cn = self.__class__.__name__ + return "%s(url=%r)" % (cn, self._url) + + def __str__(self): + # type: () -> str + # TODO: the underlying URL's __str__ needs to change to make + # this work as the URL, see #55 + return str(self._url) + + def __eq__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + return self.normalize().to_uri() == other.normalize().to_uri() + + def __ne__(self, other): + # type: (Any) -> bool + if not isinstance(other, self.__class__): + return NotImplemented + return not self.__eq__(other) + + def __hash__(self): + # type: () -> int + return hash( + ( + self.__class__, + self.scheme, + self.userinfo, + self.host, + self.path, + self.query, + self.fragment, + self.port, + self.rooted, + self.uses_netloc, + ) + ) + + # # Begin Twisted Compat Code + asURI = to_uri + asIRI = to_iri + + @classmethod + def fromText(cls, s, lazy=False): + # type: (Text, bool) -> DecodedURL + return cls.from_text(s, lazy=lazy) + + def asText(self, includeSecrets=False): + # type: (bool) -> Text + return self.to_text(with_password=includeSecrets) + + def __dir__(self): + # type: () -> Sequence[Text] + try: + ret = object.__dir__(self) + except AttributeError: + # object.__dir__ == AttributeError # pdw for py2 + ret = dir(self.__class__) + list(self.__dict__.keys()) + ret = sorted(set(ret) - set(["fromText", "asURI", "asIRI", "asText"])) + return ret + + # # End Twisted Compat Code + + +# Add some overloads so that parse gives a 
better return value. +@overload +def parse(url, decoded, lazy=False): + # type: (Text, Literal[False], bool) -> URL + """Passing decoded=False returns URL.""" + + +@overload +def parse(url, decoded=True, lazy=False): + # type: (Text, Literal[True], bool) -> DecodedURL + """Passing decoded=True (or the default value) returns DecodedURL.""" + + +@overload +def parse(url, decoded=True, lazy=False): + # type: (Text, bool, bool) -> Union[URL, DecodedURL] + """If decoded is not a literal we don't know the return type.""" + + +def parse(url, decoded=True, lazy=False): + # type: (Text, bool, bool) -> Union[URL, DecodedURL] + """ + Automatically turn text into a structured URL object. + + >>> url = parse(u"https://github.com/python-hyper/hyperlink") + >>> print(url.to_text()) + https://github.com/python-hyper/hyperlink + + Args: + url: A text string representation of a URL. + + decoded: Whether or not to return a :class:`DecodedURL`, + which automatically handles all + encoding/decoding/quoting/unquoting for all the various + accessors of parts of the URL, or a :class:`URL`, + which has the same API, but requires handling of special + characters for different parts of the URL. + + lazy: In the case of `decoded=True`, this controls + whether the URL is decoded immediately or as accessed. The + default, `lazy=False`, checks all encoded parts of the URL + for decodability. + + .. versionadded:: 18.0.0 + """ + enc_url = EncodedURL.from_text(url) + if not decoded: + return enc_url + dec_url = DecodedURL(enc_url, lazy=lazy) + return dec_url diff --git a/src/hyperlink/hypothesis.py b/src/hyperlink/hypothesis.py new file mode 100644 index 00000000..4ab987eb --- /dev/null +++ b/src/hyperlink/hypothesis.py @@ -0,0 +1,321 @@ +# -*- coding: utf-8 -*- +""" +Hypothesis strategies. +""" +from __future__ import absolute_import + +try: + import hypothesis + + del hypothesis +except ImportError: + from typing import Tuple + + __all__ = () # type: Tuple[str, ...] 
+else: + from csv import reader as csv_reader + from os.path import dirname, join + from string import ascii_letters, digits + from sys import maxunicode + from typing import ( + Callable, + Iterable, + List, + Optional, + Sequence, + Text, + TypeVar, + cast, + ) + from gzip import open as open_gzip + + from . import DecodedURL, EncodedURL + + from hypothesis import assume + from hypothesis.strategies import ( + composite, + integers, + lists, + sampled_from, + text, + ) + + from idna import IDNAError, check_label, encode as idna_encode + + __all__ = ( + "decoded_urls", + "encoded_urls", + "hostname_labels", + "hostnames", + "idna_text", + "paths", + "port_numbers", + ) + + T = TypeVar("T") + DrawCallable = Callable[[Callable[..., T]], T] + + try: + unichr + except NameError: # Py3 + unichr = chr # type: Callable[[int], Text] + + def idna_characters(): + # type: () -> Text + """ + Returns a string containing IDNA characters. + """ + global _idnaCharacters + + if not _idnaCharacters: + result = [] + + # Data source "IDNA Derived Properties": + # https://www.iana.org/assignments/idna-tables-6.3.0/ + # idna-tables-6.3.0.xhtml#idna-tables-properties + dataFileName = join( + dirname(__file__), "idna-tables-properties.csv.gz" + ) + with open_gzip(dataFileName) as dataFile: + reader = csv_reader( + (line.decode("utf-8") for line in dataFile), + delimiter=",", + ) + next(reader) # Skip header row + for row in reader: + codes, prop, description = row + + if prop != "PVALID": + # CONTEXTO or CONTEXTJ are also allowed, but they come + # with rules, so we're punting on those here. 
+ # See: https://tools.ietf.org/html/rfc5892 + continue + + startEnd = row[0].split("-", 1) + if len(startEnd) == 1: + # No end of range given; use start + startEnd.append(startEnd[0]) + start, end = (int(i, 16) for i in startEnd) + + for i in range(start, end + 1): + if i > maxunicode: # Happens using Py2 on Windows + break + result.append(unichr(i)) + + _idnaCharacters = u"".join(result) + + return _idnaCharacters + + _idnaCharacters = "" # type: Text + + @composite + def idna_text(draw, min_size=1, max_size=None): + # type: (DrawCallable, int, Optional[int]) -> Text + """ + A strategy which generates IDNA-encodable text. + + @param min_size: The minimum number of characters in the text. + C{None} is treated as C{0}. + + @param max_size: The maximum number of characters in the text. + Use C{None} for an unbounded size. + """ + alphabet = idna_characters() + + assert min_size >= 1 + + if max_size is not None: + assert max_size >= 1 + + result = cast( + Text, + draw(text(min_size=min_size, max_size=max_size, alphabet=alphabet)), + ) + + # FIXME: There should be a more efficient way to ensure we produce + # valid IDNA text. + try: + idna_encode(result) + except IDNAError: + assume(False) + + return result + + @composite + def port_numbers(draw, allow_zero=False): + # type: (DrawCallable, bool) -> int + """ + A strategy which generates port numbers. + + @param allow_zero: Whether to allow port C{0} as a possible value. + """ + if allow_zero: + min_value = 0 + else: + min_value = 1 + + return cast(int, draw(integers(min_value=min_value, max_value=65535))) + + @composite + def hostname_labels(draw, allow_idn=True): + # type: (DrawCallable, bool) -> Text + """ + A strategy which generates host name labels. + + @param allow_idn: Whether to allow non-ASCII characters as allowed by + internationalized domain names (IDNs). 
+ """ + if allow_idn: + label = cast(Text, draw(idna_text(min_size=1, max_size=63))) + + try: + label.encode("ascii") + except UnicodeEncodeError: + # If the label doesn't encode to ASCII, then we need to check + # the length of the label after encoding to punycode and adding + # the xn-- prefix. + while len(label.encode("punycode")) > 63 - len("xn--"): + # Rather than bombing out, just trim from the end until it + # is short enough, so hypothesis doesn't have to generate + # new data. + label = label[:-1] + + else: + label = cast( + Text, + draw( + text( + min_size=1, + max_size=63, + alphabet=Text(ascii_letters + digits + u"-"), + ) + ), + ) + + # Filter invalid labels. + # It would be better to reliably avoid generation of bogus labels in + # the first place, but it's hard... + try: + check_label(label) + except UnicodeError: # pragma: no cover (not always drawn) + assume(False) + + return label + + @composite + def hostnames(draw, allow_leading_digit=True, allow_idn=True): + # type: (DrawCallable, bool, bool) -> Text + """ + A strategy which generates host names. + + @param allow_leading_digit: Whether to allow a leading digit in host + names; they were not allowed prior to RFC 1123. + + @param allow_idn: Whether to allow non-ASCII characters as allowed by + internationalized domain names (IDNs). + """ + # Draw first label, filtering out labels with leading digits if needed + labels = [ + cast( + Text, + draw( + hostname_labels(allow_idn=allow_idn).filter( + lambda l: ( + True if allow_leading_digit else l[0] not in digits + ) + ) + ), + ) + ] + # Draw remaining labels + labels += cast( + List[Text], + draw( + lists( + hostname_labels(allow_idn=allow_idn), + min_size=1, + max_size=4, + ) + ), + ) + + # Trim off labels until the total host name length fits in 252 + # characters. This avoids having to filter the data. 
+ while sum(len(label) for label in labels) + len(labels) - 1 > 252: + labels = labels[:-1] + + return u".".join(labels) + + def path_characters(): + # type: () -> str + """ + Returns a string containing valid URL path characters. + """ + global _path_characters + + if _path_characters is None: + + def chars(): + # type: () -> Iterable[Text] + for i in range(maxunicode): + c = unichr(i) + + # Exclude reserved characters + if c in "#/?": + continue + + # Exclude anything not UTF-8 compatible + try: + c.encode("utf-8") + except UnicodeEncodeError: + continue + + yield c + + _path_characters = "".join(chars()) + + return _path_characters + + _path_characters = None # type: Optional[str] + + @composite + def paths(draw): + # type: (DrawCallable) -> Sequence[Text] + return cast( + List[Text], + draw( + lists(text(min_size=1, alphabet=path_characters()), max_size=10) + ), + ) + + @composite + def encoded_urls(draw): + # type: (DrawCallable) -> EncodedURL + """ + A strategy which generates L{EncodedURL}s. + Call the L{EncodedURL.to_uri} method on each URL to get an HTTP + protocol-friendly URI. + """ + port = cast(Optional[int], draw(port_numbers(allow_zero=True))) + host = cast(Text, draw(hostnames())) + path = cast(Sequence[Text], draw(paths())) + + if port == 0: + port = None + + return EncodedURL( + scheme=cast(Text, draw(sampled_from((u"http", u"https")))), + host=host, + port=port, + path=path, + ) + + @composite + def decoded_urls(draw): + # type: (DrawCallable) -> DecodedURL + """ + A strategy which generates L{DecodedURL}s. + Call the L{EncodedURL.to_uri} method on each URL to get an HTTP + protocol-friendly URI. 
+ """ + return DecodedURL(draw(encoded_urls())) diff --git a/src/hyperlink/idna-tables-properties.csv.gz b/src/hyperlink/idna-tables-properties.csv.gz new file mode 100644 index 00000000..48e9f067 Binary files /dev/null and b/src/hyperlink/idna-tables-properties.csv.gz differ diff --git a/src/hyperlink/py.typed b/src/hyperlink/py.typed new file mode 100644 index 00000000..d2dfd5e4 --- /dev/null +++ b/src/hyperlink/py.typed @@ -0,0 +1 @@ +# See: https://www.python.org/dev/peps/pep-0561/ diff --git a/src/hyperlink/test/__init__.py b/src/hyperlink/test/__init__.py new file mode 100644 index 00000000..e10ca70f --- /dev/null +++ b/src/hyperlink/test/__init__.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +""" +Tests for hyperlink +""" + +__all = () + + +def _init_hypothesis(): + # type: () -> None + from os import environ + + if "CI" in environ: + try: + from hypothesis import HealthCheck, settings + except ImportError: + return + + settings.register_profile( + "patience", + settings( + suppress_health_check=[ + HealthCheck.too_slow, + HealthCheck.filter_too_much, + ] + ), + ) + settings.load_profile("patience") + + +_init_hypothesis() diff --git a/src/hyperlink/test/common.py b/src/hyperlink/test/common.py new file mode 100644 index 00000000..ad3bd04a --- /dev/null +++ b/src/hyperlink/test/common.py @@ -0,0 +1,68 @@ +from typing import Any, Callable, Optional, Type +from unittest import TestCase + + +class HyperlinkTestCase(TestCase): + """This type mostly exists to provide a backwards-compatible + assertRaises method for Python 2.6 testing. + """ + + def assertRaises( # type: ignore[override] + self, + expected_exception, # type: Type[BaseException] + callableObj=None, # type: Optional[Callable[..., Any]] + *args, # type: Any + **kwargs # type: Any + ): + # type: (...) -> Any + """Fail unless an exception of class expected_exception is raised + by callableObj when invoked with arguments args and keyword + arguments kwargs. 
If a different type of exception is + raised, it will not be caught, and the test case will be + deemed to have suffered an error, exactly as for an + unexpected exception. + + If called with callableObj omitted or None, will return a + context object used like this:: + + with self.assertRaises(SomeException): + do_something() + + The context manager keeps a reference to the exception as + the 'exception' attribute. This allows you to inspect the + exception after the assertion:: + + with self.assertRaises(SomeException) as cm: + do_something() + the_exception = cm.exception + self.assertEqual(the_exception.error_code, 3) + """ + context = _AssertRaisesContext(expected_exception, self) + if callableObj is None: + return context + with context: + callableObj(*args, **kwargs) + + +class _AssertRaisesContext(object): + "A context manager used to implement HyperlinkTestCase.assertRaises." + + def __init__(self, expected, test_case): + # type: (Type[BaseException], TestCase) -> None + self.expected = expected + self.failureException = test_case.failureException + + def __enter__(self): + # type: () -> "_AssertRaisesContext" + return self + + def __exit__(self, exc_type, exc_value, tb): + # type: (Optional[Type[BaseException]], Any, Any) -> bool + if exc_type is None: + exc_name = self.expected.__name__ + raise self.failureException("%s not raised" % (exc_name,)) + if not issubclass(exc_type, self.expected): + # let unexpected exceptions pass through + return False + self.exception = exc_value # store for later retrieval + return True diff --git a/hyperlink/test/test_common.py b/src/hyperlink/test/test_common.py similarity index 80% rename from hyperlink/test/test_common.py rename to src/hyperlink/test/test_common.py index 1d61583c..dc5e5bb8 100644 --- a/hyperlink/test/test_common.py +++ b/src/hyperlink/test/test_common.py @@ -1,29 +1,28 @@ """ Tests for hyperlink.test.common """ +from typing import Any from unittest import TestCase from .common import HyperlinkTestCase 
class _ExpectedException(Exception): - """An exception used to test HyperlinkTestCase.assertRaises. - - """ + """An exception used to test HyperlinkTestCase.assertRaises.""" class _UnexpectedException(Exception): - """An exception used to test HyperlinkTestCase.assertRaises. - - """ + """An exception used to test HyperlinkTestCase.assertRaises.""" class TestHyperlink(TestCase): """Tests for HyperlinkTestCase""" def setUp(self): + # type: () -> None self.hyperlink_test = HyperlinkTestCase("run") def test_assertRaisesWithCallable(self): + # type: () -> None """HyperlinkTestCase.assertRaises does not raise an AssertionError when given a callable that, when called with the provided arguments, raises the expected exception. @@ -32,44 +31,51 @@ def test_assertRaisesWithCallable(self): called_with = [] def raisesExpected(*args, **kwargs): + # type: (Any, Any) -> None called_with.append((args, kwargs)) raise _ExpectedException - self.hyperlink_test.assertRaises(_ExpectedException, - raisesExpected, 1, keyword=True) + self.hyperlink_test.assertRaises( + _ExpectedException, raisesExpected, 1, keyword=True + ) self.assertEqual(called_with, [((1,), {"keyword": True})]) def test_assertRaisesWithCallableUnexpectedException(self): + # type: () -> None """When given a callable that raises an unexpected exception, HyperlinkTestCase.assertRaises raises that exception. """ def doesNotRaiseExpected(*args, **kwargs): + # type: (Any, Any) -> None raise _UnexpectedException try: - self.hyperlink_test.assertRaises(_ExpectedException, - doesNotRaiseExpected) + self.hyperlink_test.assertRaises( + _ExpectedException, doesNotRaiseExpected + ) except _UnexpectedException: pass def test_assertRaisesWithCallableDoesNotRaise(self): + # type: () -> None """HyperlinkTestCase.assertRaises raises an AssertionError when given a callable that, when called, does not raise any exception. 
""" def doesNotRaise(*args, **kwargs): - return True + # type: (Any, Any) -> None + pass try: - self.hyperlink_test.assertRaises(_ExpectedException, - doesNotRaise) + self.hyperlink_test.assertRaises(_ExpectedException, doesNotRaise) except AssertionError: pass def test_assertRaisesContextManager(self): + # type: () -> None """HyperlinkTestCase.assertRaises does not raise an AssertionError when used as a context manager with a suite that raises the expected exception. The context manager stores the exception @@ -79,9 +85,12 @@ def test_assertRaisesContextManager(self): with self.hyperlink_test.assertRaises(_ExpectedException) as cm: raise _ExpectedException - self.assertTrue(isinstance(cm.exception, _ExpectedException)) + self.assertTrue( # type: ignore[unreachable] + isinstance(cm.exception, _ExpectedException) + ) def test_assertRaisesContextManagerUnexpectedException(self): + # type: () -> None """When used as a context manager with a block that raises an unexpected exception, HyperlinkTestCase.assertRaises raises that unexpected exception. @@ -94,6 +103,7 @@ def test_assertRaisesContextManagerUnexpectedException(self): pass def test_assertRaisesContextManagerDoesNotRaise(self): + # type: () -> None """HyperlinkTestcase.assertRaises raises an AssertionError when used as a context manager with a block that does not raise any exception. diff --git a/src/hyperlink/test/test_decoded_url.py b/src/hyperlink/test/test_decoded_url.py new file mode 100644 index 00000000..48452579 --- /dev/null +++ b/src/hyperlink/test/test_decoded_url.py @@ -0,0 +1,256 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +from typing import Dict, Union +from .. 
import DecodedURL, URL +from .._url import _percent_decode +from .common import HyperlinkTestCase + +BASIC_URL = "http://example.com/#" +TOTAL_URL = ( + "https://%75%73%65%72:%00%00%00%00@xn--bcher-kva.ch:8080/" + "a/nice%20nice/./path/?zot=23%25&zut#frég" +) + + +class TestURL(HyperlinkTestCase): + def test_durl_basic(self): + # type: () -> None + bdurl = DecodedURL.from_text(BASIC_URL) + assert bdurl.scheme == "http" + assert bdurl.host == "example.com" + assert bdurl.port == 80 + assert bdurl.path == ("",) + assert bdurl.fragment == "" + + durl = DecodedURL.from_text(TOTAL_URL) + + assert durl.scheme == "https" + assert durl.host == "bücher.ch" + assert durl.port == 8080 + assert durl.path == ("a", "nice nice", ".", "path", "") + assert durl.fragment == "frég" + assert durl.get("zot") == ["23%"] + + assert durl.user == "user" + assert durl.userinfo == ("user", "\0\0\0\0") + + def test_roundtrip_iri_parameter_values(self): + # type: () -> None + """ + .to_iri() should never modify the application-level data of a query + parameter. + """ + for value in ["hello", "goodbye", "+", "/", ":", "?"]: + self.assertEqual( + DecodedURL(DecodedURL().set("test", value).to_iri()).get( + "test" + ), + [value], + ) + + def test_roundtrip_uri_parameter_values(self): + # type: () -> None + """ + .to_uri() should never modify the application-level data of a query + parameter. 
+ """ + for value in ["hello", "goodbye", "+", "/", ":", "?"]: + self.assertEqual( + DecodedURL(DecodedURL().set("test", value).to_uri()).get( + "test" + ), + [value], + ) + + def test_passthroughs(self): + # type: () -> None + + # just basic tests for the methods that more or less pass straight + # through to the underlying URL + + durl = DecodedURL.from_text(TOTAL_URL) + assert durl.sibling("te%t").path[-1] == "te%t" + assert durl.child("../test2%").path[-1] == "../test2%" + assert durl.child() == durl + assert durl.child() is durl + assert durl.click("/").path[-1] == "" + assert durl.user == "user" + + assert "." in durl.path + assert "." not in durl.normalize().path + + assert durl.to_uri().fragment == "fr%C3%A9g" + assert " " in durl.to_iri().path[1] + + assert durl.to_text(with_password=True) == TOTAL_URL + + assert durl.absolute + assert durl.rooted + + assert durl == durl.encoded_url.get_decoded_url() + + durl2 = DecodedURL.from_text(TOTAL_URL, lazy=True) + assert durl2 == durl2.encoded_url.get_decoded_url(lazy=True) + + assert ( + str(DecodedURL.from_text(BASIC_URL).child(" ")) + == "http://example.com/%20" + ) + + assert not (durl == 1) + assert durl != 1 + + def test_repr(self): + # type: () -> None + durl = DecodedURL.from_text(TOTAL_URL) + assert repr(durl) == "DecodedURL(url=" + repr(durl._url) + ")" + + def test_query_manipulation(self): + # type: () -> None + durl = DecodedURL.from_text(TOTAL_URL) + + assert durl.get("zot") == ["23%"] + durl = durl.add(" ", "space") + assert durl.get(" ") == ["space"] + durl = durl.set(" ", "spa%ed") + assert durl.get(" ") == ["spa%ed"] + + durl = DecodedURL(url=durl.to_uri()) + assert durl.get(" ") == ["spa%ed"] + durl = durl.remove(" ") + assert durl.get(" ") == [] + + durl = DecodedURL.from_text("/?%61rg=b&arg=c") + assert durl.get("arg") == ["b", "c"] + + assert durl.set("arg", "d").get("arg") == ["d"] + + durl = DecodedURL.from_text( + "https://example.com/a/b/?fóó=1&bar=2&fóó=3" + ) + assert durl.remove("fóó") 
== DecodedURL.from_text( + "https://example.com/a/b/?bar=2" + ) + assert durl.remove("fóó", value="1") == DecodedURL.from_text( + "https://example.com/a/b/?bar=2&fóó=3" + ) + assert durl.remove("fóó", limit=1) == DecodedURL.from_text( + "https://example.com/a/b/?bar=2&fóó=3" + ) + assert durl.remove("fóó", value="1", limit=0) == DecodedURL.from_text( + "https://example.com/a/b/?fóó=1&bar=2&fóó=3" + ) + + def test_equality_and_hashability(self): + # type: () -> None + durl = DecodedURL.from_text(TOTAL_URL) + durl2 = DecodedURL.from_text(TOTAL_URL) + burl = DecodedURL.from_text(BASIC_URL) + durl_uri = durl.to_uri() + + assert durl == durl + assert durl == durl2 + assert durl != burl + assert durl is not None + assert durl != durl._url + + AnyURL = Union[URL, DecodedURL] + + durl_map = {} # type: Dict[AnyURL, AnyURL] + durl_map[durl] = durl + durl_map[durl2] = durl2 + + assert len(durl_map) == 1 + + durl_map[burl] = burl + + assert len(durl_map) == 2 + + durl_map[durl_uri] = durl_uri + + assert len(durl_map) == 3 + + def test_replace_roundtrip(self): + # type: () -> None + durl = DecodedURL.from_text(TOTAL_URL) + + durl2 = durl.replace( + scheme=durl.scheme, + host=durl.host, + path=durl.path, + query=durl.query, + fragment=durl.fragment, + port=durl.port, + rooted=durl.rooted, + userinfo=durl.userinfo, + uses_netloc=durl.uses_netloc, + ) + + assert durl == durl2 + + def test_replace_userinfo(self): + # type: () -> None + durl = DecodedURL.from_text(TOTAL_URL) + with self.assertRaises(ValueError): + durl.replace( + userinfo=( # type: ignore[arg-type] + "user", + "pw", + "thiswillcauseafailure", + ) + ) + return + + def test_twisted_compat(self): + # type: () -> None + durl = DecodedURL.from_text(TOTAL_URL) + + assert durl == DecodedURL.fromText(TOTAL_URL) + assert "to_text" in dir(durl) + assert "asText" not in dir(durl) + assert durl.to_text() == durl.asText() + + def test_percent_decode_mixed(self): + # type: () -> None + + # See 
https://github.com/python-hyper/hyperlink/pull/59 for a + # nice discussion of the possibilities + assert _percent_decode("abcdé%C3%A9éfg") == "abcdéééfg" + + # still allow percent encoding in the case of an error + assert _percent_decode("abcdé%C3éfg") == "abcdé%C3éfg" + + # ...unless explicitly told otherwise + with self.assertRaises(UnicodeDecodeError): + _percent_decode("abcdé%C3éfg", raise_subencoding_exc=True) + + # when not encodable as subencoding + assert _percent_decode("é%25é", subencoding="ascii") == "é%25é" + + def test_click_decoded_url(self): + # type: () -> None + durl = DecodedURL.from_text(TOTAL_URL) + durl_dest = DecodedURL.from_text("/tëst") + + clicked = durl.click(durl_dest) + assert clicked.host == durl.host + assert clicked.path == durl_dest.path + assert clicked.path == ("tëst",) + + def test_decode_plus(self): + # type: () -> None + durl = DecodedURL.from_text("/x+y%2B?a=b+c%2B") + assert durl.path == ("x+y+",) + assert durl.get("a") == ["b c+"] + assert durl.query == (("a", "b c+"),) + + def test_decode_nonplussed(self): + # type: () -> None + durl = DecodedURL.from_text( + "/x+y%2B?a=b+c%2B", query_plus_is_space=False + ) + assert durl.path == ("x+y+",) + assert durl.get("a") == ["b+c+"] + assert durl.query == (("a", "b+c+"),) diff --git a/src/hyperlink/test/test_hypothesis.py b/src/hyperlink/test/test_hypothesis.py new file mode 100644 index 00000000..776ed7b7 --- /dev/null +++ b/src/hyperlink/test/test_hypothesis.py @@ -0,0 +1,214 @@ +# -*- coding: utf-8 -*- +""" +Tests for hyperlink.hypothesis. 
+""" + +try: + import hypothesis + + del hypothesis +except ImportError: + pass +else: + from string import digits + from typing import Sequence, Text + + try: + from unittest.mock import patch + except ImportError: + from mock import patch # type: ignore[misc] + + from hypothesis import given, settings + from hypothesis.strategies import SearchStrategy, data + + from idna import IDNAError, check_label, encode as idna_encode + + from .common import HyperlinkTestCase + from .. import DecodedURL, EncodedURL + from ..hypothesis import ( + DrawCallable, + composite, + decoded_urls, + encoded_urls, + hostname_labels, + hostnames, + idna_text, + paths, + port_numbers, + ) + + class TestHypothesisStrategies(HyperlinkTestCase): + """ + Tests for hyperlink.hypothesis. + """ + + @given(idna_text()) + def test_idna_text_valid(self, text): + # type: (Text) -> None + """ + idna_text() generates IDNA-encodable text. + """ + try: + idna_encode(text) + except IDNAError: # pragma: no cover + raise AssertionError("Invalid IDNA text: {!r}".format(text)) + + @given(data()) + def test_idna_text_min_max(self, data): + # type: (SearchStrategy) -> None + """ + idna_text() raises AssertionError if min_size is < 1. + """ + self.assertRaises(AssertionError, data.draw, idna_text(min_size=0)) + self.assertRaises(AssertionError, data.draw, idna_text(max_size=0)) + + @given(port_numbers()) + def test_port_numbers_bounds(self, port): + # type: (int) -> None + """ + port_numbers() generates integers between 1 and 65535, inclusive. + """ + self.assertGreaterEqual(port, 1) + self.assertLessEqual(port, 65535) + + @given(port_numbers(allow_zero=True)) + def test_port_numbers_bounds_allow_zero(self, port): + # type: (int) -> None + """ + port_numbers(allow_zero=True) generates integers between 0 and + 65535, inclusive. 
+ """ + self.assertGreaterEqual(port, 0) + self.assertLessEqual(port, 65535) + + @given(hostname_labels()) + def test_hostname_labels_valid_idn(self, label): + # type: (Text) -> None + """ + hostname_labels() generates IDN host name labels. + """ + try: + check_label(label) + idna_encode(label) + except UnicodeError: # pragma: no cover + raise AssertionError("Invalid IDN label: {!r}".format(label)) + + @given(data()) + @settings(max_examples=10) + def test_hostname_labels_long_idn_punycode(self, data): + # type: (SearchStrategy) -> None + """ + hostname_labels() handles case where idna_text() generates text + that encoded to punycode ends up as longer than allowed. + """ + + @composite + def mock_idna_text(draw, min_size, max_size): + # type: (DrawCallable, int, int) -> Text + # We want a string that does not exceed max_size, but when + # encoded to punycode, does exceed max_size. + # So use a unicode character that is larger when encoded, + # "á" being a great example, and use it max_size times, which + # will be max_size * 3 in size when encoded. + return u"\N{LATIN SMALL LETTER A WITH ACUTE}" * max_size + + with patch("hyperlink.hypothesis.idna_text", mock_idna_text): + label = data.draw(hostname_labels()) + try: + check_label(label) + idna_encode(label) + except UnicodeError: # pragma: no cover + raise AssertionError( + "Invalid IDN label: {!r}".format(label) + ) + + @given(hostname_labels(allow_idn=False)) + def test_hostname_labels_valid_ascii(self, label): + # type: (Text) -> None + """ + hostname_labels() generates a ASCII host name labels. + """ + try: + check_label(label) + label.encode("ascii") + except UnicodeError: # pragma: no cover + raise AssertionError("Invalid ASCII label: {!r}".format(label)) + + @given(hostnames()) + def test_hostnames_idn(self, hostname): + # type: (Text) -> None + """ + hostnames() generates a IDN host names. 
+ """ + try: + for label in hostname.split(u"."): + check_label(label) + idna_encode(hostname) + except UnicodeError: # pragma: no cover + raise AssertionError( + "Invalid IDN host name: {!r}".format(hostname) + ) + + @given(hostnames(allow_leading_digit=False)) + def test_hostnames_idn_nolead(self, hostname): + # type: (Text) -> None + """ + hostnames(allow_leading_digit=False) generates a IDN host names + without leading digits. + """ + self.assertTrue(hostname == hostname.lstrip(digits)) + + @given(hostnames(allow_idn=False)) + def test_hostnames_ascii(self, hostname): + # type: (Text) -> None + """ + hostnames() generates a ASCII host names. + """ + try: + for label in hostname.split(u"."): + check_label(label) + hostname.encode("ascii") + except UnicodeError: # pragma: no cover + raise AssertionError( + "Invalid ASCII host name: {!r}".format(hostname) + ) + + @given(hostnames(allow_leading_digit=False, allow_idn=False)) + def test_hostnames_ascii_nolead(self, hostname): + # type: (Text) -> None + """ + hostnames(allow_leading_digit=False, allow_idn=False) generates + ASCII host names without leading digits. + """ + self.assertTrue(hostname == hostname.lstrip(digits)) + + @given(paths()) + def test_paths(self, path): + # type: (Sequence[Text]) -> None + """ + paths() generates sequences of URL path components. + """ + text = u"/".join(path) + try: + text.encode("utf-8") + except UnicodeError: # pragma: no cover + raise AssertionError("Invalid URL path: {!r}".format(path)) + + for segment in path: + self.assertNotIn("#/?", segment) + + @given(encoded_urls()) + def test_encoded_urls(self, url): + # type: (EncodedURL) -> None + """ + encoded_urls() generates EncodedURLs. + """ + self.assertIsInstance(url, EncodedURL) + + @given(decoded_urls()) + def test_decoded_urls(self, url): + # type: (DecodedURL) -> None + """ + decoded_urls() generates DecodedURLs. 
+ """ + self.assertIsInstance(url, DecodedURL) diff --git a/hyperlink/test/test_parse.py b/src/hyperlink/test/test_parse.py similarity index 64% rename from hyperlink/test/test_parse.py rename to src/hyperlink/test/test_parse.py index cd2e9c97..66b02709 100644 --- a/hyperlink/test/test_parse.py +++ b/src/hyperlink/test/test_parse.py @@ -5,24 +5,28 @@ from .common import HyperlinkTestCase from hyperlink import parse, EncodedURL, DecodedURL -BASIC_URL = 'http://example.com/#' -TOTAL_URL = "https://%75%73%65%72:%00%00%00%00@xn--bcher-kva.ch:8080/a/nice%20nice/./path/?zot=23%25&zut#frég" -UNDECODABLE_FRAG_URL = TOTAL_URL + '%C3' +BASIC_URL = "http://example.com/#" +TOTAL_URL = ( + "https://%75%73%65%72:%00%00%00%00@xn--bcher-kva.ch:8080" + "/a/nice%20nice/./path/?zot=23%25&zut#frég" +) +UNDECODABLE_FRAG_URL = TOTAL_URL + "%C3" # the %C3 above percent-decodes to an unpaired \xc3 byte which makes this # invalid utf8 class TestURL(HyperlinkTestCase): def test_parse(self): + # type: () -> None purl = parse(TOTAL_URL) assert isinstance(purl, DecodedURL) - assert purl.user == 'user' - assert purl.get('zot') == ['23%'] - assert purl.fragment == 'frég' + assert purl.user == "user" + assert purl.get("zot") == ["23%"] + assert purl.fragment == "frég" purl2 = parse(TOTAL_URL, decoded=False) assert isinstance(purl2, EncodedURL) - assert purl2.get('zot') == ['23%25'] + assert purl2.get("zot") == ["23%25"] with self.assertRaises(UnicodeDecodeError): purl3 = parse(UNDECODABLE_FRAG_URL) @@ -31,5 +35,3 @@ def test_parse(self): with self.assertRaises(UnicodeDecodeError): purl3.fragment - - return diff --git a/src/hyperlink/test/test_scheme_registration.py b/src/hyperlink/test/test_scheme_registration.py new file mode 100644 index 00000000..b43c91e3 --- /dev/null +++ b/src/hyperlink/test/test_scheme_registration.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals +from typing import cast + + +from .. 
import _url +from .common import HyperlinkTestCase +from .._url import register_scheme, URL, DecodedURL + + +class TestSchemeRegistration(HyperlinkTestCase): + def setUp(self): + # type: () -> None + self._orig_scheme_port_map = dict(_url.SCHEME_PORT_MAP) + self._orig_no_netloc_schemes = set(_url.NO_NETLOC_SCHEMES) + + def tearDown(self): + # type: () -> None + _url.SCHEME_PORT_MAP = self._orig_scheme_port_map + _url.NO_NETLOC_SCHEMES = self._orig_no_netloc_schemes + + def test_register_scheme_basic(self): + # type: () -> None + register_scheme("deltron", uses_netloc=True, default_port=3030) + + u1 = URL.from_text("deltron://example.com") + assert u1.scheme == "deltron" + assert u1.port == 3030 + assert u1.uses_netloc is True + + # test netloc works even when the original gives no indication + u2 = URL.from_text("deltron:") + u2 = u2.replace(host="example.com") + assert u2.to_text() == "deltron://example.com" + + # test default port means no emission + u3 = URL.from_text("deltron://example.com:3030") + assert u3.to_text() == "deltron://example.com" + + register_scheme("nonetron", default_port=3031) + u4 = URL(scheme="nonetron") + u4 = u4.replace(host="example.com") + assert u4.to_text() == "nonetron://example.com" + + def test_register_no_netloc_scheme(self): + # type: () -> None + register_scheme("noloctron", uses_netloc=False) + u4 = URL(scheme="noloctron") + u4 = u4.replace(path=("example", "path")) + assert u4.to_text() == "noloctron:example/path" + + def test_register_no_netloc_with_port(self): + # type: () -> None + with self.assertRaises(ValueError): + register_scheme("badnetlocless", uses_netloc=False, default_port=7) + + def test_invalid_uses_netloc(self): + # type: () -> None + with self.assertRaises(ValueError): + register_scheme("badnetloc", uses_netloc=cast(bool, None)) + with self.assertRaises(ValueError): + register_scheme("badnetloc", uses_netloc=cast(bool, object())) + + def test_register_invalid_uses_netloc(self): + # type: () -> None + with 
self.assertRaises(ValueError): + register_scheme("lol", uses_netloc=cast(bool, object())) + + def test_register_invalid_port(self): + # type: () -> None + with self.assertRaises(ValueError): + register_scheme("nope", default_port=cast(bool, object())) + + def test_register_no_quote_plus_scheme(self): + # type: () -> None + register_scheme("keepplus", query_plus_is_space=False) + plus_is_not_space = DecodedURL.from_text( + "keepplus://example.com/?q=a+b" + ) + plus_is_space = DecodedURL.from_text("https://example.com/?q=a+b") + assert plus_is_not_space.get("q") == ["a+b"] + assert plus_is_space.get("q") == ["a b"] diff --git a/src/hyperlink/test/test_socket.py b/src/hyperlink/test/test_socket.py new file mode 100644 index 00000000..5f83d45b --- /dev/null +++ b/src/hyperlink/test/test_socket.py @@ -0,0 +1,45 @@ +# mypy: always-true=inet_pton + +try: + from socket import inet_pton +except ImportError: + inet_pton = None # type: ignore[assignment] + +if not inet_pton: + import socket + + from .common import HyperlinkTestCase + from .._socket import inet_pton + + class TestSocket(HyperlinkTestCase): + def test_inet_pton_ipv4_valid(self): + # type: () -> None + data = inet_pton(socket.AF_INET, "127.0.0.1") + assert isinstance(data, bytes) + + def test_inet_pton_ipv4_bogus(self): + # type: () -> None + with self.assertRaises(socket.error): + inet_pton(socket.AF_INET, "blah") + + def test_inet_pton_ipv6_valid(self): + # type: () -> None + data = inet_pton(socket.AF_INET6, "::1") + assert isinstance(data, bytes) + + def test_inet_pton_ipv6_bogus(self): + # type: () -> None + with self.assertRaises(socket.error): + inet_pton(socket.AF_INET6, "blah") + + def test_inet_pton_bogus_family(self): + # type: () -> None + # Find an integer not associated with a known address family + i = int(socket.AF_INET6) + while True: + if i != socket.AF_INET and i != socket.AF_INET6: + break + i += 100 + + with self.assertRaises(socket.error): + inet_pton(i, "127.0.0.1") diff --git 
a/src/hyperlink/test/test_url.py b/src/hyperlink/test/test_url.py new file mode 100644 index 00000000..37c91726 --- /dev/null +++ b/src/hyperlink/test/test_url.py @@ -0,0 +1,1495 @@ +# -*- coding: utf-8 -*- + +# Copyright (c) Twisted Matrix Laboratories. +# See LICENSE for details. + +from __future__ import unicode_literals + +import sys +import socket +from typing import Any, Iterable, Optional, Text, Tuple, cast + +from .common import HyperlinkTestCase +from .. import URL, URLParseError +from .._url import inet_pton, SCHEME_PORT_MAP + + +PY2 = sys.version_info[0] == 2 +unicode = type("") + + +BASIC_URL = "http://www.foo.com/a/nice/path/?zot=23&zut" + +# Examples from RFC 3986 section 5.4, Reference Resolution Examples +relativeLinkBaseForRFC3986 = "http://a/b/c/d;p?q" +relativeLinkTestsForRFC3986 = [ + # "Normal" + # ('g:h', 'g:h'), # can't click on a scheme-having url without an abs path + ("g", "http://a/b/c/g"), + ("./g", "http://a/b/c/g"), + ("g/", "http://a/b/c/g/"), + ("/g", "http://a/g"), + ("//g", "http://g"), + ("?y", "http://a/b/c/d;p?y"), + ("g?y", "http://a/b/c/g?y"), + ("#s", "http://a/b/c/d;p?q#s"), + ("g#s", "http://a/b/c/g#s"), + ("g?y#s", "http://a/b/c/g?y#s"), + (";x", "http://a/b/c/;x"), + ("g;x", "http://a/b/c/g;x"), + ("g;x?y#s", "http://a/b/c/g;x?y#s"), + ("", "http://a/b/c/d;p?q"), + (".", "http://a/b/c/"), + ("./", "http://a/b/c/"), + ("..", "http://a/b/"), + ("../", "http://a/b/"), + ("../g", "http://a/b/g"), + ("../..", "http://a/"), + ("../../", "http://a/"), + ("../../g", "http://a/g"), + # Abnormal examples + # ".." cannot be used to change the authority component of a URI. + ("../../../g", "http://a/g"), + ("../../../../g", "http://a/g"), + # Only include "." and ".." when they are only part of a larger segment, + # not by themselves. 
+ ("/./g", "http://a/g"), + ("/../g", "http://a/g"), + ("g.", "http://a/b/c/g."), + (".g", "http://a/b/c/.g"), + ("g..", "http://a/b/c/g.."), + ("..g", "http://a/b/c/..g"), + # Unnecessary or nonsensical forms of "." and "..". + ("./../g", "http://a/b/g"), + ("./g/.", "http://a/b/c/g/"), + ("g/./h", "http://a/b/c/g/h"), + ("g/../h", "http://a/b/c/h"), + ("g;x=1/./y", "http://a/b/c/g;x=1/y"), + ("g;x=1/../y", "http://a/b/c/y"), + # Separating the reference's query and fragment components from the path. + ("g?y/./x", "http://a/b/c/g?y/./x"), + ("g?y/../x", "http://a/b/c/g?y/../x"), + ("g#s/./x", "http://a/b/c/g#s/./x"), + ("g#s/../x", "http://a/b/c/g#s/../x"), +] + + +ROUNDTRIP_TESTS = ( + "http://localhost", + "http://localhost/", + "http://127.0.0.1/", + "http://[::127.0.0.1]/", + "http://[::1]/", + "http://localhost/foo", + "http://localhost/foo/", + "http://localhost/foo!!bar/", + "http://localhost/foo%20bar/", + "http://localhost/foo%2Fbar/", + "http://localhost/foo?n", + "http://localhost/foo?n=v", + "http://localhost/foo?n=/a/b", + "http://example.com/foo!@$bar?b!@z=123", + "http://localhost/asd?a=asd%20sdf/345", + "http://(%2525)/(%2525)?(%2525)&(%2525)=(%2525)#(%2525)", + "http://(%C3%A9)/(%C3%A9)?(%C3%A9)&(%C3%A9)=(%C3%A9)#(%C3%A9)", + "?sslrootcert=/Users/glyph/Downloads/rds-ca-2015-root.pem&sslmode=verify", + # from boltons.urlutils' tests + "http://googlewebsite.com/e-shops.aspx", + "http://example.com:8080/search?q=123&business=Nothing%20Special", + "http://hatnote.com:9000/?arg=1&arg=2&arg=3", + "https://xn--bcher-kva.ch", + "http://xn--ggbla1c4e.xn--ngbc5azd/", + "http://tools.ietf.org/html/rfc3986#section-3.4", + # 'http://wiki:pedia@hatnote.com', + "ftp://ftp.rfc-editor.org/in-notes/tar/RFCs0001-0500.tar.gz", + "http://[1080:0:0:0:8:800:200C:417A]/index.html", + "ssh://192.0.2.16:2222/", + "https://[::101.45.75.219]:80/?hi=bye", + "ldap://[::192.9.5.5]/dc=example,dc=com??sub?(sn=Jensen)", + 
"mailto:me@example.com?to=me@example.com&body=hi%20http://wikipedia.org", + "news:alt.rec.motorcycle", + "tel:+1-800-867-5309", + "urn:oasis:member:A00024:x", + ( + "magnet:?xt=urn:btih:1a42b9e04e122b97a5254e3df77ab3c4b7da725f&dn=Puppy%" + "20Linux%20precise-5.7.1.iso&tr=udp://tracker.openbittorrent.com:80&" + "tr=udp://tracker.publicbt.com:80&tr=udp://tracker.istole.it:6969&" + "tr=udp://tracker.ccc.de:80&tr=udp://open.demonii.com:1337" + ), + # percent-encoded delimiters in percent-encodable fields + "https://%3A@example.com/", # colon in username + "https://%40@example.com/", # at sign in username + "https://%2f@example.com/", # slash in username + "https://a:%3a@example.com/", # colon in password + "https://a:%40@example.com/", # at sign in password + "https://a:%2f@example.com/", # slash in password + "https://a:%3f@example.com/", # question mark in password + "https://example.com/%2F/", # slash in path + "https://example.com/%3F/", # question mark in path + "https://example.com/%23/", # hash in path + "https://example.com/?%23=b", # hash in query param name + "https://example.com/?%3D=b", # equals in query param name + "https://example.com/?%26=b", # ampersand in query param name + "https://example.com/?a=%23", # hash in query param value + "https://example.com/?a=%26", # ampersand in query param value + "https://example.com/?a=%3D", # equals in query param value + "https://example.com/?foo+bar=baz", # plus in query param name + "https://example.com/?foo=bar+baz", # plus in query param value + # double-encoded percent sign in all percent-encodable positions: + "http://(%2525):(%2525)@example.com/(%2525)/?(%2525)=(%2525)#(%2525)", + # colon in first part of schemeless relative url + "first_seg_rel_path__colon%3Anotok/second_seg__colon%3Aok", +) + + +class TestURL(HyperlinkTestCase): + """ + Tests for L{URL}. + """ + + def assertUnicoded(self, u): + # type: (URL) -> None + """ + The given L{URL}'s components should be L{unicode}. 
+ + @param u: The L{URL} to test. + """ + self.assertTrue( + isinstance(u.scheme, unicode) or u.scheme is None, repr(u) + ) + self.assertTrue(isinstance(u.host, unicode) or u.host is None, repr(u)) + for seg in u.path: + self.assertEqual(type(seg), unicode, repr(u)) + for (_k, v) in u.query: + self.assertEqual(type(seg), unicode, repr(u)) + self.assertTrue(v is None or isinstance(v, unicode), repr(u)) + self.assertEqual(type(u.fragment), unicode, repr(u)) + + def assertURL( + self, + u, # type: URL + scheme, # type: Text + host, # type: Text + path, # type: Iterable[Text] + query, # type: Iterable[Tuple[Text, Optional[Text]]] + fragment, # type: Text + port, # type: Optional[int] + userinfo="", # type: Text + ): + # type: (...) -> None + """ + The given L{URL} should have the given components. + + @param u: The actual L{URL} to examine. + + @param scheme: The expected scheme. + + @param host: The expected host. + + @param path: The expected path. + + @param query: The expected query. + + @param fragment: The expected fragment. + + @param port: The expected port. + + @param userinfo: The expected userinfo. + """ + actual = ( + u.scheme, + u.host, + u.path, + u.query, + u.fragment, + u.port, + u.userinfo, + ) + expected = ( + scheme, + host, + tuple(path), + tuple(query), + fragment, + port, + u.userinfo, + ) + self.assertEqual(actual, expected) + + def test_initDefaults(self): + # type: () -> None + """ + L{URL} should have appropriate default values. + """ + + def check(u): + # type: (URL) -> None + self.assertUnicoded(u) + self.assertURL(u, "http", "", [], [], "", 80, "") + + check(URL("http", "")) + check(URL("http", "", [], [])) + check(URL("http", "", [], [], "")) + + def test_init(self): + # type: () -> None + """ + L{URL} should accept L{unicode} parameters. 
+ """ + u = URL("s", "h", ["p"], [("k", "v"), ("k", None)], "f") + self.assertUnicoded(u) + self.assertURL(u, "s", "h", ["p"], [("k", "v"), ("k", None)], "f", None) + + self.assertURL( + URL("http", "\xe0", ["\xe9"], [("\u03bb", "\u03c0")], "\u22a5"), + "http", + "\xe0", + ["\xe9"], + [("\u03bb", "\u03c0")], + "\u22a5", + 80, + ) + + def test_initPercent(self): + # type: () -> None + """ + L{URL} should accept (and not interpret) percent characters. + """ + u = URL("s", "%68", ["%70"], [("%6B", "%76"), ("%6B", None)], "%66") + self.assertUnicoded(u) + self.assertURL( + u, "s", "%68", ["%70"], [("%6B", "%76"), ("%6B", None)], "%66", None + ) + + def test_repr(self): + # type: () -> None + """ + L{URL.__repr__} will display the canonical form of the URL, wrapped in + a L{URL.from_text} invocation, so that it is C{eval}-able but still + easy to read. + """ + self.assertEqual( + repr( + URL( + scheme="http", + host="foo", + path=["bar"], + query=[("baz", None), ("k", "v")], + fragment="frob", + ) + ), + "URL.from_text(%s)" % (repr("http://foo/bar?baz&k=v#frob"),), + ) + + def test_from_text(self): + # type: () -> None + """ + Round-tripping L{URL.from_text} with C{str} results in an equivalent + URL. + """ + urlpath = URL.from_text(BASIC_URL) + self.assertEqual(BASIC_URL, urlpath.to_text()) + + def test_roundtrip(self): + # type: () -> None + """ + L{URL.to_text} should invert L{URL.from_text}. 
+ """ + for test in ROUNDTRIP_TESTS: + result = URL.from_text(test).to_text(with_password=True) + self.assertEqual(test, result) + + def test_roundtrip_double_iri(self): + # type: () -> None + for test in ROUNDTRIP_TESTS: + url = URL.from_text(test) + iri = url.to_iri() + double_iri = iri.to_iri() + assert iri == double_iri + + iri_text = iri.to_text(with_password=True) + double_iri_text = double_iri.to_text(with_password=True) + assert iri_text == double_iri_text + return + + def test_equality(self): + # type: () -> None + """ + Two URLs decoded using L{URL.from_text} will be equal (C{==}) if they + decoded same URL string, and unequal (C{!=}) if they decoded different + strings. + """ + urlpath = URL.from_text(BASIC_URL) + self.assertEqual(urlpath, URL.from_text(BASIC_URL)) + self.assertNotEqual( + urlpath, + URL.from_text( + "ftp://www.anotherinvaliddomain.com/" "foo/bar/baz/?zot=21&zut" + ), + ) + + def test_fragmentEquality(self): + # type: () -> None + """ + An URL created with the empty string for a fragment compares equal + to an URL created with an unspecified fragment. + """ + self.assertEqual(URL(fragment=""), URL()) + self.assertEqual( + URL.from_text("http://localhost/#"), + URL.from_text("http://localhost/"), + ) + + def test_child(self): + # type: () -> None + """ + L{URL.child} appends a new path segment, but does not affect the query + or fragment. 
+ """ + urlpath = URL.from_text(BASIC_URL) + self.assertEqual( + "http://www.foo.com/a/nice/path/gong?zot=23&zut", + urlpath.child("gong").to_text(), + ) + self.assertEqual( + "http://www.foo.com/a/nice/path/gong%2F?zot=23&zut", + urlpath.child("gong/").to_text(), + ) + self.assertEqual( + "http://www.foo.com/a/nice/path/gong%2Fdouble?zot=23&zut", + urlpath.child("gong/double").to_text(), + ) + self.assertEqual( + "http://www.foo.com/a/nice/path/gong%2Fdouble%2F?zot=23&zut", + urlpath.child("gong/double/").to_text(), + ) + + def test_multiChild(self): + # type: () -> None + """ + L{URL.child} receives multiple segments as C{*args} and appends each in + turn. + """ + url = URL.from_text("http://example.com/a/b") + self.assertEqual( + url.child("c", "d", "e").to_text(), "http://example.com/a/b/c/d/e" + ) + + def test_childInitRoot(self): + # type: () -> None + """ + L{URL.child} of a L{URL} without a path produces a L{URL} with a single + path segment. + """ + childURL = URL(host="www.foo.com").child("c") + self.assertTrue(childURL.rooted) + self.assertEqual("http://www.foo.com/c", childURL.to_text()) + + def test_emptyChild(self): + # type: () -> None + """ + L{URL.child} without any new segments returns the original L{URL}. + """ + url = URL(host="www.foo.com") + self.assertEqual(url.child(), url) + + def test_sibling(self): + # type: () -> None + """ + L{URL.sibling} of a L{URL} replaces the last path segment, but does not + affect the query or fragment. + """ + urlpath = URL.from_text(BASIC_URL) + self.assertEqual( + "http://www.foo.com/a/nice/path/sister?zot=23&zut", + urlpath.sibling("sister").to_text(), + ) + # Use an url without trailing '/' to check child removal. 
+ url_text = "http://www.foo.com/a/nice/path?zot=23&zut" + urlpath = URL.from_text(url_text) + self.assertEqual( + "http://www.foo.com/a/nice/sister?zot=23&zut", + urlpath.sibling("sister").to_text(), + ) + + def test_click(self): + # type: () -> None + """ + L{URL.click} interprets the given string as a relative URI-reference + and returns a new L{URL} interpreting C{self} as the base absolute URI. + """ + urlpath = URL.from_text(BASIC_URL) + # A null uri should be valid (return here). + self.assertEqual( + "http://www.foo.com/a/nice/path/?zot=23&zut", + urlpath.click("").to_text(), + ) + # A simple relative path remove the query. + self.assertEqual( + "http://www.foo.com/a/nice/path/click", + urlpath.click("click").to_text(), + ) + # An absolute path replace path and query. + self.assertEqual( + "http://www.foo.com/click", urlpath.click("/click").to_text() + ) + # Replace just the query. + self.assertEqual( + "http://www.foo.com/a/nice/path/?burp", + urlpath.click("?burp").to_text(), + ) + # One full url to another should not generate '//' between authority. + # and path + self.assertTrue( + "//foobar" + not in urlpath.click("http://www.foo.com/foobar").to_text() + ) + + # From a url with no query clicking a url with a query, the query + # should be handled properly. + u = URL.from_text("http://www.foo.com/me/noquery") + self.assertEqual( + "http://www.foo.com/me/17?spam=158", + u.click("/me/17?spam=158").to_text(), + ) + + # Check that everything from the path onward is removed when the click + # link has no path. 
+ u = URL.from_text("http://localhost/foo?abc=def") + self.assertEqual( + u.click("http://www.python.org").to_text(), "http://www.python.org" + ) + + # https://twistedmatrix.com/trac/ticket/8184 + u = URL.from_text("http://hatnote.com/a/b/../c/./d/e/..") + res = "http://hatnote.com/a/c/d/" + self.assertEqual(u.click("").to_text(), res) + + # test click default arg is same as empty string above + self.assertEqual(u.click().to_text(), res) + + # test click on a URL instance + u = URL.fromText("http://localhost/foo/?abc=def") + u2 = URL.from_text("bar") + u3 = u.click(u2) + self.assertEqual(u3.to_text(), "http://localhost/foo/bar") + + def test_clickRFC3986(self): + # type: () -> None + """ + L{URL.click} should correctly resolve the examples in RFC 3986. + """ + base = URL.from_text(relativeLinkBaseForRFC3986) + for (ref, expected) in relativeLinkTestsForRFC3986: + self.assertEqual(base.click(ref).to_text(), expected) + + def test_clickSchemeRelPath(self): + # type: () -> None + """ + L{URL.click} should not accept schemes with relative paths. + """ + base = URL.from_text(relativeLinkBaseForRFC3986) + self.assertRaises(NotImplementedError, base.click, "g:h") + self.assertRaises(NotImplementedError, base.click, "http:h") + + def test_cloneUnchanged(self): + # type: () -> None + """ + Verify that L{URL.replace} doesn't change any of the arguments it + is passed. + """ + urlpath = URL.from_text("https://x:1/y?z=1#A") + self.assertEqual( + urlpath.replace( + urlpath.scheme, + urlpath.host, + urlpath.path, + urlpath.query, + urlpath.fragment, + urlpath.port, + ), + urlpath, + ) + self.assertEqual(urlpath.replace(), urlpath) + + def test_clickCollapse(self): + # type: () -> None + """ + L{URL.click} collapses C{.} and C{..} according to RFC 3986 section + 5.2.4. 
+ """ + tests = [ + ["http://localhost/", ".", "http://localhost/"], + ["http://localhost/", "..", "http://localhost/"], + ["http://localhost/a/b/c", ".", "http://localhost/a/b/"], + ["http://localhost/a/b/c", "..", "http://localhost/a/"], + ["http://localhost/a/b/c", "./d/e", "http://localhost/a/b/d/e"], + ["http://localhost/a/b/c", "../d/e", "http://localhost/a/d/e"], + ["http://localhost/a/b/c", "/./d/e", "http://localhost/d/e"], + ["http://localhost/a/b/c", "/../d/e", "http://localhost/d/e"], + [ + "http://localhost/a/b/c/", + "../../d/e/", + "http://localhost/a/d/e/", + ], + ["http://localhost/a/./c", "../d/e", "http://localhost/d/e"], + ["http://localhost/a/./c/", "../d/e", "http://localhost/a/d/e"], + [ + "http://localhost/a/b/c/d", + "./e/../f/../g", + "http://localhost/a/b/c/g", + ], + ["http://localhost/a/b/c", "d//e", "http://localhost/a/b/d//e"], + ] + for start, click, expected in tests: + actual = URL.from_text(start).click(click).to_text() + self.assertEqual( + actual, + expected, + "{start}.click({click}) => {actual} not {expected}".format( + start=start, + click=repr(click), + actual=actual, + expected=expected, + ), + ) + + def test_queryAdd(self): + # type: () -> None + """ + L{URL.add} adds query parameters. + """ + self.assertEqual( + "http://www.foo.com/a/nice/path/?foo=bar", + URL.from_text("http://www.foo.com/a/nice/path/") + .add("foo", "bar") + .to_text(), + ) + self.assertEqual( + "http://www.foo.com/?foo=bar", + URL(host="www.foo.com").add("foo", "bar").to_text(), + ) + urlpath = URL.from_text(BASIC_URL) + self.assertEqual( + "http://www.foo.com/a/nice/path/?zot=23&zut&burp", + urlpath.add("burp").to_text(), + ) + self.assertEqual( + "http://www.foo.com/a/nice/path/?zot=23&zut&burp=xxx", + urlpath.add("burp", "xxx").to_text(), + ) + self.assertEqual( + "http://www.foo.com/a/nice/path/?zot=23&zut&burp=xxx&zing", + urlpath.add("burp", "xxx").add("zing").to_text(), + ) + # Note the inversion! 
+ self.assertEqual( + "http://www.foo.com/a/nice/path/?zot=23&zut&zing&burp=xxx", + urlpath.add("zing").add("burp", "xxx").to_text(), + ) + # Note the two values for the same name. + self.assertEqual( + "http://www.foo.com/a/nice/path/?zot=23&zut&burp=xxx&zot=32", + urlpath.add("burp", "xxx").add("zot", "32").to_text(), + ) + + def test_querySet(self): + # type: () -> None + """ + L{URL.set} replaces query parameters by name. + """ + urlpath = URL.from_text(BASIC_URL) + self.assertEqual( + "http://www.foo.com/a/nice/path/?zot=32&zut", + urlpath.set("zot", "32").to_text(), + ) + # Replace name without value with name/value and vice-versa. + self.assertEqual( + "http://www.foo.com/a/nice/path/?zot&zut=itworked", + urlpath.set("zot").set("zut", "itworked").to_text(), + ) + # Q: what happens when the query has two values and we replace? + # A: we replace both values with a single one + self.assertEqual( + "http://www.foo.com/a/nice/path/?zot=32&zut", + urlpath.add("zot", "xxx").set("zot", "32").to_text(), + ) + + def test_queryRemove(self): + # type: () -> None + """ + L{URL.remove} removes instances of a query parameter. + """ + url = URL.from_text("https://example.com/a/b/?foo=1&bar=2&foo=3") + self.assertEqual( + url.remove("foo"), URL.from_text("https://example.com/a/b/?bar=2") + ) + + self.assertEqual( + url.remove(name="foo", value="1"), + URL.from_text("https://example.com/a/b/?bar=2&foo=3"), + ) + + self.assertEqual( + url.remove(name="foo", limit=1), + URL.from_text("https://example.com/a/b/?bar=2&foo=3"), + ) + + self.assertEqual( + url.remove(name="foo", value="1", limit=0), + URL.from_text("https://example.com/a/b/?foo=1&bar=2&foo=3"), + ) + + def test_parseEqualSignInParamValue(self): + # type: () -> None + """ + Every C{=}-sign after the first in a query parameter is simply included + in the value of the parameter. 
+ """ + u = URL.from_text("http://localhost/?=x=x=x") + self.assertEqual(u.get(""), ["x=x=x"]) + self.assertEqual(u.to_text(), "http://localhost/?=x=x=x") + u = URL.from_text("http://localhost/?foo=x=x=x&bar=y") + self.assertEqual(u.query, (("foo", "x=x=x"), ("bar", "y"))) + self.assertEqual(u.to_text(), "http://localhost/?foo=x=x=x&bar=y") + + u = URL.from_text( + "https://example.com/?argument=3&argument=4&operator=%3D" + ) + iri = u.to_iri() + self.assertEqual(iri.get("operator"), ["="]) + # assert that the equals is not unnecessarily escaped + self.assertEqual(iri.to_uri().get("operator"), ["="]) + + def test_empty(self): + # type: () -> None + """ + An empty L{URL} should serialize as the empty string. + """ + self.assertEqual(URL().to_text(), "") + + def test_justQueryText(self): + # type: () -> None + """ + An L{URL} with query text should serialize as just query text. + """ + u = URL(query=[("hello", "world")]) + self.assertEqual(u.to_text(), "?hello=world") + + def test_identicalEqual(self): + # type: () -> None + """ + L{URL} compares equal to itself. + """ + u = URL.from_text("http://localhost/") + self.assertEqual(u, u) + + def test_similarEqual(self): + # type: () -> None + """ + URLs with equivalent components should compare equal. + """ + u1 = URL.from_text("http://u@localhost:8080/p/a/t/h?q=p#f") + u2 = URL.from_text("http://u@localhost:8080/p/a/t/h?q=p#f") + self.assertEqual(u1, u2) + + def test_differentNotEqual(self): + # type: () -> None + """ + L{URL}s that refer to different resources are both unequal (C{!=}) and + also not equal (not C{==}). + """ + u1 = URL.from_text("http://localhost/a") + u2 = URL.from_text("http://localhost/b") + self.assertFalse(u1 == u2, "%r != %r" % (u1, u2)) + self.assertNotEqual(u1, u2) + + def test_otherTypesNotEqual(self): + # type: () -> None + """ + L{URL} is not equal (C{==}) to other types. 
+ """ + u = URL.from_text("http://localhost/") + self.assertFalse(u == 42, "URL must not equal a number.") + self.assertFalse(u == object(), "URL must not equal an object.") + self.assertNotEqual(u, 42) + self.assertNotEqual(u, object()) + + def test_identicalNotUnequal(self): + # type: () -> None + """ + Identical L{URL}s are not unequal (C{!=}) to each other. + """ + u = URL.from_text("http://u@localhost:8080/p/a/t/h?q=p#f") + self.assertFalse(u != u, "%r == itself" % u) + + def test_similarNotUnequal(self): + # type: () -> None + """ + Structurally similar L{URL}s are not unequal (C{!=}) to each other. + """ + u1 = URL.from_text("http://u@localhost:8080/p/a/t/h?q=p#f") + u2 = URL.from_text("http://u@localhost:8080/p/a/t/h?q=p#f") + self.assertFalse(u1 != u2, "%r == %r" % (u1, u2)) + + def test_differentUnequal(self): + # type: () -> None + """ + Structurally different L{URL}s are unequal (C{!=}) to each other. + """ + u1 = URL.from_text("http://localhost/a") + u2 = URL.from_text("http://localhost/b") + self.assertTrue(u1 != u2, "%r == %r" % (u1, u2)) + + def test_otherTypesUnequal(self): + # type: () -> None + """ + L{URL} is unequal (C{!=}) to other types. + """ + u = URL.from_text("http://localhost/") + self.assertTrue(u != 42, "URL must differ from a number.") + self.assertTrue(u != object(), "URL must be differ from an object.") + + def test_asURI(self): + # type: () -> None + """ + L{URL.asURI} produces an URI which converts any URI unicode encoding + into pure US-ASCII and returns a new L{URL}. 
+ """ + unicodey = ( + "http://\N{LATIN SMALL LETTER E WITH ACUTE}.com/" + "\N{LATIN SMALL LETTER E}\N{COMBINING ACUTE ACCENT}" + "?\N{LATIN SMALL LETTER A}\N{COMBINING ACUTE ACCENT}=" + "\N{LATIN SMALL LETTER I}\N{COMBINING ACUTE ACCENT}" + "#\N{LATIN SMALL LETTER U}\N{COMBINING ACUTE ACCENT}" + ) + iri = URL.from_text(unicodey) + uri = iri.asURI() + self.assertEqual(iri.host, "\N{LATIN SMALL LETTER E WITH ACUTE}.com") + self.assertEqual( + iri.path[0], "\N{LATIN SMALL LETTER E}\N{COMBINING ACUTE ACCENT}" + ) + self.assertEqual(iri.to_text(), unicodey) + expectedURI = "http://xn--9ca.com/%C3%A9?%C3%A1=%C3%AD#%C3%BA" + actualURI = uri.to_text() + self.assertEqual( + actualURI, expectedURI, "%r != %r" % (actualURI, expectedURI) + ) + + def test_asIRI(self): + # type: () -> None + """ + L{URL.asIRI} decodes any percent-encoded text in the URI, making it + more suitable for reading by humans, and returns a new L{URL}. + """ + asciiish = "http://xn--9ca.com/%C3%A9?%C3%A1=%C3%AD#%C3%BA" + uri = URL.from_text(asciiish) + iri = uri.asIRI() + self.assertEqual(uri.host, "xn--9ca.com") + self.assertEqual(uri.path[0], "%C3%A9") + self.assertEqual(uri.to_text(), asciiish) + expectedIRI = ( + "http://\N{LATIN SMALL LETTER E WITH ACUTE}.com/" + "\N{LATIN SMALL LETTER E WITH ACUTE}" + "?\N{LATIN SMALL LETTER A WITH ACUTE}=" + "\N{LATIN SMALL LETTER I WITH ACUTE}" + "#\N{LATIN SMALL LETTER U WITH ACUTE}" + ) + actualIRI = iri.to_text() + self.assertEqual( + actualIRI, expectedIRI, "%r != %r" % (actualIRI, expectedIRI) + ) + + def test_badUTF8AsIRI(self): + # type: () -> None + """ + Bad UTF-8 in a path segment, query parameter, or fragment results in + that portion of the URI remaining percent-encoded in the IRI. 
+ """ + urlWithBinary = "http://xn--9ca.com/%00%FF/%C3%A9" + uri = URL.from_text(urlWithBinary) + iri = uri.asIRI() + expectedIRI = ( + "http://\N{LATIN SMALL LETTER E WITH ACUTE}.com/" + "%00%FF/" + "\N{LATIN SMALL LETTER E WITH ACUTE}" + ) + actualIRI = iri.to_text() + self.assertEqual( + actualIRI, expectedIRI, "%r != %r" % (actualIRI, expectedIRI) + ) + + def test_alreadyIRIAsIRI(self): + # type: () -> None + """ + A L{URL} composed of non-ASCII text will result in non-ASCII text. + """ + unicodey = ( + "http://\N{LATIN SMALL LETTER E WITH ACUTE}.com/" + "\N{LATIN SMALL LETTER E}\N{COMBINING ACUTE ACCENT}" + "?\N{LATIN SMALL LETTER A}\N{COMBINING ACUTE ACCENT}=" + "\N{LATIN SMALL LETTER I}\N{COMBINING ACUTE ACCENT}" + "#\N{LATIN SMALL LETTER U}\N{COMBINING ACUTE ACCENT}" + ) + iri = URL.from_text(unicodey) + alsoIRI = iri.asIRI() + self.assertEqual(alsoIRI.to_text(), unicodey) + + def test_alreadyURIAsURI(self): + # type: () -> None + """ + A L{URL} composed of encoded text will remain encoded. + """ + expectedURI = "http://xn--9ca.com/%C3%A9?%C3%A1=%C3%AD#%C3%BA" + uri = URL.from_text(expectedURI) + actualURI = uri.asURI().to_text() + self.assertEqual(actualURI, expectedURI) + + def test_userinfo(self): + # type: () -> None + """ + L{URL.from_text} will parse the C{userinfo} portion of the URI + separately from the host and port. 
+ """ + url = URL.from_text( + "http://someuser:somepassword@example.com/some-segment@ignore" + ) + self.assertEqual( + url.authority(True), "someuser:somepassword@example.com" + ) + self.assertEqual(url.authority(False), "someuser:@example.com") + self.assertEqual(url.userinfo, "someuser:somepassword") + self.assertEqual(url.user, "someuser") + self.assertEqual( + url.to_text(), "http://someuser:@example.com/some-segment@ignore" + ) + self.assertEqual( + url.replace(userinfo="someuser").to_text(), + "http://someuser@example.com/some-segment@ignore", + ) + + def test_portText(self): + # type: () -> None + """ + L{URL.from_text} parses custom port numbers as integers. + """ + portURL = URL.from_text("http://www.example.com:8080/") + self.assertEqual(portURL.port, 8080) + self.assertEqual(portURL.to_text(), "http://www.example.com:8080/") + + def test_mailto(self): + # type: () -> None + """ + Although L{URL} instances are mainly for dealing with HTTP, other + schemes (such as C{mailto:}) should work as well. For example, + L{URL.from_text}/L{URL.to_text} round-trips cleanly for a C{mailto:} + URL representing an email address. + """ + self.assertEqual( + URL.from_text("mailto:user@example.com").to_text(), + "mailto:user@example.com", + ) + + def test_httpWithoutHost(self): + # type: () -> None + """ + An HTTP URL without a hostname, but with a path, should also round-trip + cleanly. + """ + without_host = URL.from_text("http:relative-path") + self.assertEqual(without_host.host, "") + self.assertEqual(without_host.path, ("relative-path",)) + self.assertEqual(without_host.uses_netloc, False) + self.assertEqual(without_host.to_text(), "http:relative-path") + + def test_queryIterable(self): + # type: () -> None + """ + When a L{URL} is created with a C{query} argument, the C{query} + argument is converted into an N-tuple of 2-tuples, sensibly + handling dictionaries. 
+ """ + expected = (("alpha", "beta"),) + url = URL(query=[("alpha", "beta")]) + self.assertEqual(url.query, expected) + url = URL(query={"alpha": "beta"}) + self.assertEqual(url.query, expected) + + def test_pathIterable(self): + # type: () -> None + """ + When a L{URL} is created with a C{path} argument, the C{path} is + converted into a tuple. + """ + url = URL(path=["hello", "world"]) + self.assertEqual(url.path, ("hello", "world")) + + def test_invalidArguments(self): + # type: () -> None + """ + Passing an argument of the wrong type to any of the constructor + arguments of L{URL} will raise a descriptive L{TypeError}. + + L{URL} typechecks very aggressively to ensure that its constitutent + parts are all properly immutable and to prevent confusing errors when + bad data crops up in a method call long after the code that called the + constructor is off the stack. + """ + + class Unexpected(object): + def __str__(self): + # type: () -> str + return "wrong" + + def __repr__(self): + # type: () -> str + return "" + + defaultExpectation = "unicode" if bytes is str else "str" + + def assertRaised(raised, expectation, name): + # type: (Any, Text, Text) -> None + self.assertEqual( + str(raised.exception), + "expected {0} for {1}, got {2}".format( + expectation, name, "" + ), + ) + + def check(param, expectation=defaultExpectation): + # type: (Any, str) -> None + with self.assertRaises(TypeError) as raised: + URL(**{param: Unexpected()}) # type: ignore[arg-type] + + assertRaised(raised, expectation, param) + + check("scheme") + check("host") + check("fragment") + check("rooted", "bool") + check("userinfo") + check("port", "int or NoneType") + + with self.assertRaises(TypeError) as raised: + URL(path=[cast(Text, Unexpected())]) + + assertRaised(raised, defaultExpectation, "path segment") + + with self.assertRaises(TypeError) as raised: + URL(query=[("name", cast(Text, Unexpected()))]) + + assertRaised( + raised, defaultExpectation + " or NoneType", "query parameter 
value" + ) + + with self.assertRaises(TypeError) as raised: + URL(query=[(cast(Text, Unexpected()), "value")]) + + assertRaised(raised, defaultExpectation, "query parameter name") + # No custom error message for this one, just want to make sure + # non-2-tuples don't get through. + + with self.assertRaises(TypeError): + URL(query=[cast(Tuple[Text, Text], Unexpected())]) + + with self.assertRaises(ValueError): + URL(query=[cast(Tuple[Text, Text], ("k", "v", "vv"))]) + + with self.assertRaises(ValueError): + URL(query=[cast(Tuple[Text, Text], ("k",))]) + + url = URL.from_text("https://valid.example.com/") + with self.assertRaises(TypeError) as raised: + url.child(cast(Text, Unexpected())) + assertRaised(raised, defaultExpectation, "path segment") + with self.assertRaises(TypeError) as raised: + url.sibling(cast(Text, Unexpected())) + assertRaised(raised, defaultExpectation, "path segment") + with self.assertRaises(TypeError) as raised: + url.click(cast(Text, Unexpected())) + assertRaised(raised, defaultExpectation, "relative URL") + + def test_technicallyTextIsIterableBut(self): + # type: () -> None + """ + Technically, L{str} (or L{unicode}, as appropriate) is iterable, but + C{URL(path="foo")} resulting in C{URL.from_text("f/o/o")} is never what + you want. 
+ """ + with self.assertRaises(TypeError) as raised: + URL(path="foo") + self.assertEqual( + str(raised.exception), + "expected iterable of text for path, not: {0}".format(repr("foo")), + ) + + def test_netloc(self): + # type: () -> None + url = URL(scheme="https") + self.assertEqual(url.uses_netloc, True) + self.assertEqual(url.to_text(), "https://") + # scheme, no host, no path, no netloc hack + self.assertEqual(URL.from_text("https:").uses_netloc, False) + # scheme, no host, absolute path, no netloc hack + self.assertEqual(URL.from_text("https:/").uses_netloc, False) + # scheme, no host, no path, netloc hack to indicate :// syntax + self.assertEqual(URL.from_text("https://").uses_netloc, True) + + url = URL(scheme="https", uses_netloc=False) + self.assertEqual(url.uses_netloc, False) + self.assertEqual(url.to_text(), "https:") + + url = URL(scheme="git+https") + self.assertEqual(url.uses_netloc, True) + self.assertEqual(url.to_text(), "git+https://") + + url = URL(scheme="mailto") + self.assertEqual(url.uses_netloc, False) + self.assertEqual(url.to_text(), "mailto:") + + url = URL(scheme="ztp") + self.assertEqual(url.uses_netloc, None) + self.assertEqual(url.to_text(), "ztp:") + + url = URL.from_text("ztp://test.com") + self.assertEqual(url.uses_netloc, True) + + url = URL.from_text("ztp:test:com") + self.assertEqual(url.uses_netloc, False) + + def test_ipv6_with_port(self): + # type: () -> None + t = "https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:80/" + url = URL.from_text(t) + assert url.host == "2001:0db8:85a3:0000:0000:8a2e:0370:7334" + assert url.port == 80 + assert SCHEME_PORT_MAP[url.scheme] != url.port + + def test_basic(self): + # type: () -> None + text = "https://user:pass@example.com/path/to/here?k=v#nice" + url = URL.from_text(text) + assert url.scheme == "https" + assert url.userinfo == "user:pass" + assert url.host == "example.com" + assert url.path == ("path", "to", "here") + assert url.fragment == "nice" + + text = 
"https://user:pass@127.0.0.1/path/to/here?k=v#nice" + url = URL.from_text(text) + assert url.scheme == "https" + assert url.userinfo == "user:pass" + assert url.host == "127.0.0.1" + assert url.path == ("path", "to", "here") + + text = "https://user:pass@[::1]/path/to/here?k=v#nice" + url = URL.from_text(text) + assert url.scheme == "https" + assert url.userinfo == "user:pass" + assert url.host == "::1" + assert url.path == ("path", "to", "here") + + def test_invalid_url(self): + # type: () -> None + self.assertRaises(URLParseError, URL.from_text, "#\n\n") + + def test_invalid_authority_url(self): + # type: () -> None + self.assertRaises(URLParseError, URL.from_text, "http://abc:\n\n/#") + + def test_invalid_ipv6(self): + # type: () -> None + invalid_ipv6_ips = [ + "2001::0234:C1ab::A0:aabc:003F", + "2001::1::3F", + ":", + "::::", + "::256.0.0.1", + ] + for ip in invalid_ipv6_ips: + url_text = "http://[" + ip + "]" + self.assertRaises(socket.error, inet_pton, socket.AF_INET6, ip) + self.assertRaises(URLParseError, URL.from_text, url_text) + + def test_invalid_port(self): + # type: () -> None + self.assertRaises(URLParseError, URL.from_text, "ftp://portmouth:smash") + self.assertRaises( + ValueError, + URL.from_text, + "http://reader.googlewebsite.com:neverforget", + ) + + def test_idna(self): + # type: () -> None + u1 = URL.from_text("http://bücher.ch") + self.assertEqual(u1.host, "bücher.ch") + self.assertEqual(u1.to_text(), "http://bücher.ch") + self.assertEqual(u1.to_uri().to_text(), "http://xn--bcher-kva.ch") + + u2 = URL.from_text("https://xn--bcher-kva.ch") + self.assertEqual(u2.host, "xn--bcher-kva.ch") + self.assertEqual(u2.to_text(), "https://xn--bcher-kva.ch") + self.assertEqual(u2.to_iri().to_text(), "https://bücher.ch") + + def test_netloc_slashes(self): + # type: () -> None + + # basic sanity checks + url = URL.from_text("mailto:mahmoud@hatnote.com") + self.assertEqual(url.scheme, "mailto") + self.assertEqual(url.to_text(), 
"mailto:mahmoud@hatnote.com") + + url = URL.from_text("http://hatnote.com") + self.assertEqual(url.scheme, "http") + self.assertEqual(url.to_text(), "http://hatnote.com") + + # test that unrecognized schemes stay consistent with '//' + url = URL.from_text("newscheme:a:b:c") + self.assertEqual(url.scheme, "newscheme") + self.assertEqual(url.to_text(), "newscheme:a:b:c") + + url = URL.from_text("newerscheme://a/b/c") + self.assertEqual(url.scheme, "newerscheme") + self.assertEqual(url.to_text(), "newerscheme://a/b/c") + + # test that reasonable guesses are made + url = URL.from_text("git+ftp://gitstub.biz/glyph/lefkowitz") + self.assertEqual(url.scheme, "git+ftp") + self.assertEqual(url.to_text(), "git+ftp://gitstub.biz/glyph/lefkowitz") + + url = URL.from_text("what+mailto:freerealestate@enotuniq.org") + self.assertEqual(url.scheme, "what+mailto") + self.assertEqual( + url.to_text(), "what+mailto:freerealestate@enotuniq.org" + ) + + url = URL(scheme="ztp", path=("x", "y", "z"), rooted=True) + self.assertEqual(url.to_text(), "ztp:/x/y/z") + + # also works when the input doesn't include '//' + url = URL( + scheme="git+ftp", + path=("x", "y", "z", ""), + rooted=True, + uses_netloc=True, + ) + # broken bc urlunsplit + self.assertEqual(url.to_text(), "git+ftp:///x/y/z/") + + # really why would this ever come up but ok + url = URL.from_text("file:///path/to/heck") + url2 = url.replace(scheme="mailto") + self.assertEqual(url2.to_text(), "mailto:/path/to/heck") + + url_text = "unregisteredscheme:///a/b/c" + url = URL.from_text(url_text) + no_netloc_url = url.replace(uses_netloc=False) + self.assertEqual(no_netloc_url.to_text(), "unregisteredscheme:/a/b/c") + netloc_url = url.replace(uses_netloc=True) + self.assertEqual(netloc_url.to_text(), url_text) + + return + + def test_rooted_to_relative(self): + # type: () -> None + """ + On host-relative URLs, the C{rooted} flag can be updated to indicate + that the path should no longer be treated as absolute. 
+ """ + a = URL(path=["hello"]) + self.assertEqual(a.to_text(), "hello") + b = a.replace(rooted=True) + self.assertEqual(b.to_text(), "/hello") + self.assertNotEqual(a, b) + + def test_autorooted(self): + # type: () -> None + """ + The C{rooted} flag can be updated in some cases, but it cannot be made + to conflict with other facts surrounding the URL; for example, all URLs + involving an authority (host) are inherently rooted because it is not + syntactically possible to express otherwise; also, once an unrooted URL + gains a path that starts with an empty string, that empty string is + elided and it becomes rooted, because these cases are syntactically + indistinguisable in real URL text. + """ + relative_path_rooted = URL(path=["", "foo"], rooted=False) + self.assertEqual(relative_path_rooted.rooted, True) + relative_flag_rooted = URL(path=["foo"], rooted=True) + self.assertEqual(relative_flag_rooted.rooted, True) + self.assertEqual(relative_path_rooted, relative_flag_rooted) + + attempt_unrooted_absolute = URL(host="foo", path=["bar"], rooted=False) + normal_absolute = URL(host="foo", path=["bar"]) + self.assertEqual(attempt_unrooted_absolute, normal_absolute) + self.assertEqual(normal_absolute.rooted, True) + self.assertEqual(attempt_unrooted_absolute.rooted, True) + + def test_rooted_with_port_but_no_host(self): + # type: () -> None + """ + URLs which include a ``://`` netloc-separator for any reason are + inherently rooted, regardless of the value or presence of the + ``rooted`` constructor argument. + + They may include a netloc-separator because their constructor was + directly invoked with an explicit host or port, or because they were + parsed from a string which included the literal ``://`` separator. 
+ """ + directly_constructed = URL(scheme="udp", port=4900, rooted=False) + directly_constructed_implict = URL(scheme="udp", port=4900) + directly_constructed_rooted = URL(scheme="udp", port=4900, rooted=True) + self.assertEqual(directly_constructed.rooted, True) + self.assertEqual(directly_constructed_implict.rooted, True) + self.assertEqual(directly_constructed_rooted.rooted, True) + parsed = URL.from_text("udp://:4900") + self.assertEqual(str(directly_constructed), str(parsed)) + self.assertEqual(str(directly_constructed_implict), str(parsed)) + self.assertEqual(directly_constructed.asText(), parsed.asText()) + self.assertEqual(directly_constructed, parsed) + self.assertEqual(directly_constructed, directly_constructed_implict) + self.assertEqual(directly_constructed, directly_constructed_rooted) + self.assertEqual(directly_constructed_implict, parsed) + self.assertEqual(directly_constructed_rooted, parsed) + + def test_wrong_constructor(self): + # type: () -> None + with self.assertRaises(ValueError): + # whole URL not allowed + URL(BASIC_URL) + with self.assertRaises(ValueError): + # explicitly bad scheme not allowed + URL("HTTP_____more_like_imHoTTeP") + + def test_encoded_userinfo(self): + # type: () -> None + url = URL.from_text("http://user:pass@example.com") + assert url.userinfo == "user:pass" + url = url.replace(userinfo="us%20her:pass") + iri = url.to_iri() + assert ( + iri.to_text(with_password=True) == "http://us her:pass@example.com" + ) + assert iri.to_text(with_password=False) == "http://us her:@example.com" + assert ( + iri.to_uri().to_text(with_password=True) + == "http://us%20her:pass@example.com" + ) + + def test_hash(self): + # type: () -> None + url_map = {} + url1 = URL.from_text("http://blog.hatnote.com/ask?utm_source=geocity") + assert hash(url1) == hash(url1) # sanity + + url_map[url1] = 1 + + url2 = URL.from_text("http://blog.hatnote.com/ask") + url2 = url2.set("utm_source", "geocity") + + url_map[url2] = 2 + + assert len(url_map) == 1 + 
assert list(url_map.values()) == [2] + + assert hash(URL()) == hash(URL()) # slightly more sanity + + def test_dir(self): + # type: () -> None + url = URL() + res = dir(url) + + assert len(res) > 15 + # twisted compat + assert "fromText" not in res + assert "asText" not in res + assert "asURI" not in res + assert "asIRI" not in res + + def test_twisted_compat(self): + # type: () -> None + url = URL.fromText("http://example.com/a%20té%C3%A9st") + assert url.asText() == "http://example.com/a%20té%C3%A9st" + assert url.asURI().asText() == "http://example.com/a%20t%C3%A9%C3%A9st" + # TODO: assert url.asIRI().asText() == u'http://example.com/a%20téést' + + def test_set_ordering(self): + # type: () -> None + + # TODO + url = URL.from_text("http://example.com/?a=b&c") + url = url.set("x", "x") + url = url.add("x", "y") + assert url.to_text() == "http://example.com/?a=b&x=x&c&x=y" + # Would expect: + # assert url.to_text() == u'http://example.com/?a=b&c&x=x&x=y' + + def test_schemeless_path(self): + # type: () -> None + "See issue #4" + u1 = URL.from_text("urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob") + u2 = URL.from_text(u1.to_text()) + assert u1 == u2 # sanity testing roundtripping + + u3 = URL.from_text(u1.to_iri().to_text()) + assert u1 == u3 + assert u2 == u3 + + # test that colons are ok past the first segment + u4 = URL.from_text("first-segment/urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob") + u5 = u4.to_iri() + assert u5.to_text() == "first-segment/urn:ietf:wg:oauth:2.0:oob" + + u6 = URL.from_text(u5.to_text()).to_uri() + assert u5 == u6 # colons stay decoded bc they're not in the first seg + + def test_emoji_domain(self): + # type: () -> None + "See issue #7, affecting only narrow builds (2.6-3.3)" + url = URL.from_text("https://xn--vi8hiv.ws") + iri = url.to_iri() + iri.to_text() + # as long as we don't get ValueErrors, we're good + + def test_delim_in_param(self): + # type: () -> None + "Per issue #6 and #8" + self.assertRaises(ValueError, URL, scheme="http", host="a/c") + 
self.assertRaises(ValueError, URL, path=("?",)) + self.assertRaises(ValueError, URL, path=("#",)) + self.assertRaises(ValueError, URL, query=(("&", "test"))) + + def test_empty_paths_eq(self): + # type: () -> None + u1 = URL.from_text("http://example.com/") + u2 = URL.from_text("http://example.com") + + assert u1 == u2 + + u1 = URL.from_text("http://example.com") + u2 = URL.from_text("http://example.com") + + assert u1 == u2 + + u1 = URL.from_text("http://example.com") + u2 = URL.from_text("http://example.com/") + + assert u1 == u2 + + u1 = URL.from_text("http://example.com/") + u2 = URL.from_text("http://example.com/") + + assert u1 == u2 + + def test_from_text_type(self): + # type: () -> None + assert URL.from_text("#ok").fragment == "ok" # sanity + self.assertRaises(TypeError, URL.from_text, b"bytes://x.y.z") + self.assertRaises(TypeError, URL.from_text, object()) + + def test_from_text_bad_authority(self): + # type: () -> None + + # bad ipv6 brackets + self.assertRaises(URLParseError, URL.from_text, "http://[::1/") + self.assertRaises(URLParseError, URL.from_text, "http://::1]/") + self.assertRaises(URLParseError, URL.from_text, "http://[[::1]/") + self.assertRaises(URLParseError, URL.from_text, "http://[::1]]/") + + # empty port + self.assertRaises(URLParseError, URL.from_text, "http://127.0.0.1:") + # non-integer port + self.assertRaises(URLParseError, URL.from_text, "http://127.0.0.1:hi") + # extra port colon (makes for an invalid host) + self.assertRaises(URLParseError, URL.from_text, "http://127.0.0.1::80") + + def test_normalize(self): + # type: () -> None + url = URL.from_text("HTTP://Example.com/A%61/./../A%61?B%62=C%63#D%64") + assert url.get("Bb") == [] + assert url.get("B%62") == ["C%63"] + assert len(url.path) == 4 + + # test that most expected normalizations happen + norm_url = url.normalize() + + assert norm_url.scheme == "http" + assert norm_url.host == "example.com" + assert norm_url.path == ("Aa",) + assert norm_url.get("Bb") == ["Cc"] + assert 
norm_url.fragment == "Dd" + assert norm_url.to_text() == "http://example.com/Aa?Bb=Cc#Dd" + + # test that flags work + noop_norm_url = url.normalize( + scheme=False, host=False, path=False, query=False, fragment=False + ) + assert noop_norm_url == url + + # test that empty paths get at least one slash + slashless_url = URL.from_text("http://example.io") + slashful_url = slashless_url.normalize() + assert slashful_url.to_text() == "http://example.io/" + + # test case normalization for percent encoding + delimited_url = URL.from_text("/a%2fb/cd%3f?k%3d=v%23#test") + norm_delimited_url = delimited_url.normalize() + assert norm_delimited_url.to_text() == "/a%2Fb/cd%3F?k%3D=v%23#test" + + # test invalid percent encoding during normalize + assert ( + URL(path=("", "%te%sts")).normalize(percents=False).to_text() + == "/%te%sts" + ) + assert URL(path=("", "%te%sts")).normalize().to_text() == "/%25te%25sts" + + percenty_url = URL( + scheme="ftp", + path=["%%%", "%a%b"], + query=[("%", "%%")], + fragment="%", + userinfo="%:%", + ) + + assert ( + percenty_url.to_text(with_password=True) + == "ftp://%:%@/%%%/%a%b?%=%%#%" + ) + assert ( + percenty_url.normalize().to_text(with_password=True) + == "ftp://%25:%25@/%25%25%25/%25a%25b?%25=%25%25#%25" + ) + + def test_str(self): + # type: () -> None + + # see also issue #49 + text = "http://example.com/á/y%20a%20y/?b=%25" + url = URL.from_text(text) + assert unicode(url) == text + assert bytes(url) == b"http://example.com/%C3%A1/y%20a%20y/?b=%25" + + if PY2: + assert isinstance(str(url), bytes) + assert isinstance(unicode(url), unicode) + else: + assert isinstance(str(url), unicode) + assert isinstance(bytes(url), bytes) + + def test_idna_corners(self): + # type: () -> None + url = URL.from_text("http://abé.com/") + assert url.to_iri().host == "abé.com" + assert url.to_uri().host == "xn--ab-cja.com" + + url = URL.from_text("http://ドメイン.テスト.co.jp#test") + assert url.to_iri().host == "ドメイン.テスト.co.jp" + assert url.to_uri().host == 
"xn--eckwd4c7c.xn--zckzah.co.jp" + + assert url.to_uri().get_decoded_url().host == "ドメイン.テスト.co.jp" + + text = "http://Example.com" + assert ( + URL.from_text(text).to_uri().get_decoded_url().host == "example.com" + ) diff --git a/tox.ini b/tox.ini index ef2ec9c5..2165a835 100644 --- a/tox.ini +++ b/tox.ini @@ -1,26 +1,395 @@ [tox] -envlist = py26,py27,py34,py35,py36,pypy,coverage-report,packaging + +envlist = + flake8, black, mypy + test-py{26,27,34,35,36,37,38,39,py2,py3} + coverage_report + docs + packaging + +skip_missing_interpreters = {tty:True:False} + + +[default] + +basepython = python3.9 + +deps = + idna==2.9 # rq.filter: <3 + +setenv = + PY_MODULE=hyperlink + + PYTHONPYCACHEPREFIX={envtmpdir}/pycache + + +## +# Default environment: unit tests +## [testenv] -changedir = .tox -deps = -rrequirements-test.txt -commands = coverage run --parallel --rcfile {toxinidir}/.tox-coveragerc -m pytest --doctest-modules {envsitepackagesdir}/hyperlink {posargs} -# Uses default basepython otherwise reporting doesn't work on Travis where -# Python 3.6 is only available in 3.6 jobs. 
-[testenv:coverage-report] -changedir = .tox -deps = coverage -commands = coverage combine --rcfile {toxinidir}/.tox-coveragerc - coverage report --rcfile {toxinidir}/.tox-coveragerc - coverage html --rcfile {toxinidir}/.tox-coveragerc -d {toxinidir}/htmlcov +description = run tests + +basepython = + py: python + + py26: python2.6 + py27: python2.7 + py34: python3.4 + py35: python3.5 + py36: python3.6 + py37: python3.7 + py38: python3.8 + py39: python3.9 + py310: python3.10 + py311: python3.11 + py312: python3.12 + py313: python3.13 + + pypy2: pypy + pypy3: pypy3 + +deps = + {[default]deps} + + # In Python 2, we need to pull in typing, mock + py{26,27,py2}: typing==3.10.0.0 + py{26,27,py2}: mock==3.0.5 # rq.filter: <4 + + # For pytest + py{26,27,34,py2}: pytest==4.6.11 # rq.filter: <5 + py{35,36,37,38,39,310,311,312,313,py3}: pytest==5.2.4 + + # For code coverage + {[testenv:coverage_report]deps} + py{26,27,34,py2}: pytest-cov==2.8.1 # rq.filter: <2.9 + py{35,36,37,38,39,310,311,312,313,py3}: pytest-cov==2.10.1 + + # For hypothesis. Note Python 3.4 isn't supported by hypothesis. 
+ py{26,27,py2}: hypothesis==4.43.9 # rq.filter: <4.44 + py{35,36,37,38,39,310,311,312,313,py3}: hypothesis==5.8.6 + +setenv = + {[default]setenv} + + COVERAGE_FILE={toxworkdir}/coverage.{envname} + HYPOTHESIS_STORAGE_DIRECTORY={toxworkdir}/hypothesis + +passenv = CI + +commands = + pytest --cov={env:PY_MODULE} --cov-report=term-missing:skip-covered --doctest-modules {posargs:src/{env:PY_MODULE}} + coverage_xml: coverage xml + + +## +# Black code formatting +## + +[testenv:black] + +description = run Black (linter) + +basepython = {[default]basepython} + +skip_install = True + +deps = + black==21.7b0 + +setenv = + BLACK_LINT_ARGS=--check + +commands = + black {env:BLACK_LINT_ARGS:} {posargs:setup.py src} + + +[testenv:black-reformat] + +description = {[testenv:black]description} and reformat +basepython = {[testenv:black]basepython} +skip_install = {[testenv:black]skip_install} +deps = {[testenv:black]deps} +commands = {[testenv:black]commands} + + +## +# Flake8 linting +## + +[testenv:flake8] + +description = run Flake8 (linter) + +basepython = {[default]basepython} + +skip_install = True + +deps = + flake8-bugbear==21.4.3 + flake8==3.9.2 + mccabe==0.6.1 + pep8-naming==0.12.1 + pycodestyle==2.7.0 + pydocstyle==6.1.1 + pyflakes==2.3.1 + +commands = + flake8 {posargs:setup.py src/{env:PY_MODULE}} + + +[flake8] + +# !!! BRING THE PAIN !!! 
+select = A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z + +show-source = True +doctests = True + +max-line-length = 80 + +# Codes: http://flake8.pycqa.org/en/latest/user/error-codes.html +ignore = + # syntax error in type comment + F723, + + # function name should be lowercase + N802, + + # argument name should be lowercase + N803, + + # variable in function should be lowercase + N806, + + # variable in class scope should not be mixedCase + N815, + + # variable in global scope should not be mixedCase + N816, + + # line break before binary operator + W503, + + # End of list (allows last item to end with trailing ',') + EOL + +# flake8-import-order: local module name space +application-import-names = deploy + + +## +# Mypy static type checking +## + +[testenv:mypy] + +description = run Mypy (static type checker) + +basepython = {[default]basepython} + +deps = + mypy==0.910 + types-mock==0.1.5 + + {[default]deps} +commands = + mypy \ + --config-file="{toxinidir}/tox.ini" \ + --cache-dir="{toxworkdir}/mypy_cache" \ + {tty:--pretty:} \ + {posargs:src} + + +[mypy] + +# Global settings + +check_untyped_defs = True +disallow_any_generics = True +disallow_incomplete_defs = True +disallow_untyped_defs = True +no_implicit_optional = True +show_column_numbers = True +show_error_codes = True +strict_optional = True +warn_no_return = True +warn_redundant_casts = True +warn_return_any = True +warn_unreachable = True +warn_unused_ignores = True + +# DrawCallable is generic + +[mypy-hyperlink.hypothesis] +disallow_any_generics = False +[mypy-hyperlink.test.test_hypothesis] +disallow_any_generics = False + +# Don't complain about dependencies known to lack type hints + +[mypy-hypothesis] +ignore_missing_imports = True +[mypy-hypothesis.*] +ignore_missing_imports = True + +[mypy-idna] +ignore_missing_imports = True + + +## +# Coverage report +## + +[testenv:coverage_report] + +description = generate coverage report + +depends = 
test-py{26,27,34,35,36,37,38,39,310,311,312,313,py2,py3} + +basepython = {[default]basepython} + +skip_install = True + +deps = + # coverage 5.0 drops Python 3.4 support + coverage==4.5.4 # rq.filter: <5 + +setenv = + {[default]setenv} + + COVERAGE_FILE={toxworkdir}/coverage + +commands = + coverage combine + - coverage report + - coverage html + + +## +# Codecov +## + +[testenv:codecov] + +description = upload coverage to Codecov + +depends = {[coverage_report]depends} + +basepython = python + +skip_install = True + +deps = + {[testenv:coverage_report]deps} + codecov==2.1.12 + +passenv = + # See https://github.com/codecov/codecov-python/blob/master/README.md#using-tox + # And CI-specific docs: + # https://help.github.com/en/articles/virtual-environments-for-github-actions#default-environment-variables + # https://docs.travis-ci.com/user/environment-variables#default-environment-variables + # https://www.appveyor.com/docs/environment-variables/ + TOXENV CODECOV_* CI + GITHUB_* + TRAVIS TRAVIS_* + APPVEYOR APPVEYOR_* + +setenv = + {[testenv:coverage_report]setenv} + + COVERAGE_XML={envlogdir}/coverage.xml + +commands = + # Note documentation for CI variables in passenv above + coverage combine + coverage xml -o "{env:COVERAGE_XML}" + codecov --file="{env:COVERAGE_XML}" --env \ + GITHUB_REF GITHUB_COMMIT GITHUB_USER GITHUB_WORKFLOW \ + TRAVIS_BRANCH TRAVIS_BUILD_WEB_URL \ + TRAVIS_COMMIT TRAVIS_COMMIT_MESSAGE \ + APPVEYOR_REPO_BRANCH APPVEYOR_REPO_COMMIT \ + APPVEYOR_REPO_COMMIT_AUTHOR_EMAIL \ + APPVEYOR_REPO_COMMIT_MESSAGE_EXTENDED + + +## +# Documentation +## + +[testenv:docs] + +description = build documentation + +basepython = {[default]basepython} + +deps = + Sphinx==4.1.2 + sphinx-rtd-theme==0.5.2 + +commands = + sphinx-build \ + -b html -d "{envtmpdir}/doctrees" \ + "{toxinidir}/docs" \ + "{toxinidir}/htmldocs" + + +[testenv:docs-auto] + +description = build documentation and rebuild automatically + +basepython = {[default]basepython} + +deps = + 
{[testenv:docs]deps} + sphinx-autobuild==2021.3.14 + +commands = + sphinx-autobuild \ + -b html -d "{envtmpdir}/doctrees" \ + --host=localhost \ + "{toxinidir}/docs" \ + "{toxinidir}/htmldocs" + + +## +# Packaging +## [testenv:packaging] -changedir = {toxinidir} + +description = check for potential packaging problems + +basepython = {[default]basepython} + +skip_install = True + deps = - check-manifest==0.35 - readme_renderer==17.2 + check-manifest==0.46 + readme-renderer==29.0 + twine==3.4.2 + commands = check-manifest - python setup.py check --metadata --restructuredtext --strict + pip wheel --wheel-dir "{envtmpdir}/dist" --no-deps {toxinidir} + twine check "{envtmpdir}/dist/"* + + +## +# Print dependencies +## + +[testenv:dependencies] + +description = print dependencies + +basepython = {[default]basepython} + +recreate = true + +deps = + +commands = + pip freeze --exclude={env:PY_MODULE}