From 23242160168fabf50c26ee1124d72ee8ab971327 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Sat, 11 Jul 2020 21:11:54 +0000
Subject: [PATCH] Add baseUrl and evaluate incomplete URLs relative to it

---
 qwarc/__init__.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/qwarc/__init__.py b/qwarc/__init__.py
index c501765..34b2316 100644
--- a/qwarc/__init__.py
+++ b/qwarc/__init__.py
@@ -29,11 +29,26 @@ class Item:
 		self.session = session
 		self.headers = headers
 		self.warc = warc
+		if not hasattr(self, '_baseUrl'): # To allow subclasses to set the baseUrl before calling super().__init__
+			self._baseUrl = None # No base URL by default; incomplete request URLs are then rejected with a ValueError
 		self.stats = {'tx': 0, 'rx': 0, 'requests': 0}
 		self.logger = logging.LoggerAdapter(logging.getLogger(), {'itemType': self.itemType, 'itemValue': self.itemValue})
 
 		self.childItems = []
 
+	@property
+	def baseUrl(self): # The URL that incomplete request URLs are joined with: a yarl.URL, or None if unset
+		return self._baseUrl
+
+	@baseUrl.setter
+	def baseUrl(self, baseUrl): # Accepts None, a yarl.URL, or any other value that yarl.URL() can be constructed from
+		if baseUrl is None: # None clears the base URL
+			self._baseUrl = None
+		elif isinstance(baseUrl, yarl.URL): # Already a yarl.URL: store it unchanged
+			self._baseUrl = baseUrl
+		else: # Anything else is converted, so the stored value is always a yarl.URL or None
+			self._baseUrl = yarl.URL(baseUrl)
+
 	def _merge_headers(self, headers):
 		d = {} # Preserves order from Python 3.7 (guaranteed) or CPython 3.6 (implementation detail)
 		keys = {} # casefolded key -> d key
@@ -64,7 +79,7 @@ class Item:
 		'''
 		HTTP GET or POST a URL
 
-		url: str or yarl.URL
+		url: str or yarl.URL; if this is not a complete URL, it is evaluated relative to self.baseUrl
 		responseHandler: None or a callable that determines how the response is handled; if None, self.defaultResponseHandler is used. See qwarc.utils.handle_response_default for details.
 		method: str, must be 'GET' or 'POST'
 		data: dict or list/tuple of lists/tuples of length two or bytes or file-like or None, the data to be sent in the request body
@@ -80,6 +95,10 @@ class Item:
 		#TODO: Rewrite using 'async with self.session.get'
 
 		url = yarl.URL(url) # Explicitly convert for normalisation, percent-encoding, etc.
+		if not url.scheme or not url.host: # Incomplete URL: the scheme or the host is missing
+			if not self.baseUrl: # Nothing to resolve the incomplete URL against
+				raise ValueError('Incomplete URL and no baseUrl to join it with')
+			url = self.baseUrl.join(url) # Resolve the incomplete URL relative to the base URL
 		if responseHandler is None:
 			responseHandler = self.defaultResponseHandler
 		assert method in ('GET', 'POST'), 'method must be GET or POST'