diff --git a/apod/utility.py b/apod/utility.py index 302801f..3f8aa66 100644 --- a/apod/utility.py +++ b/apod/utility.py @@ -13,8 +13,9 @@ import re import requests -import urllib3 from bs4 import BeautifulSoup +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry # import urllib.request @@ -24,8 +25,38 @@ # location of backing APOD service BASE = "https://apod.nasa.gov/apod/" -# Create urllib3 Pool Manager -http = urllib3.PoolManager() +# apod.nasa.gov intermittently hangs or returns 5xx errors; without a timeout +# a stuck upstream connection holds a worker indefinitely, and without retries +# a single transient failure becomes a hard error for the client. +# Hung connections are bounded so the request cycle fits within gunicorn's +# default 30s worker timeout: connect hangs cost at most ~16s (3 attempts of +# 5s plus backoff) and read hangs at most ~26s (2 attempts of 13s). Transient +# 5xx responses normally arrive quickly, so retrying them adds little latency. +CONNECT_TIMEOUT = 5 # seconds to establish the upstream connection +READ_TIMEOUT = 8 # seconds to wait for the upstream response +REQUEST_TIMEOUT = (CONNECT_TIMEOUT, READ_TIMEOUT) + +# Shared session so outbound requests reuse pooled connections and retry +# transient upstream failures with exponential backoff. Retry-After headers +# are deliberately ignored: honoring a long server-specified delay on a +# 429/503 would sleep the worker past its timeout. raise_on_status=False +# hands the final response back so the caller decides how to surface it. +session = requests.Session() +session.mount( + "https://", + HTTPAdapter( + max_retries=Retry( + total=3, + connect=2, + read=1, + backoff_factor=0.5, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["GET"], + raise_on_status=False, + respect_retry_after_header=False, + ) + ), +) # function for decoding response text into utf-8 or utf-16 def _decode_response_text(res): @@ -68,10 +99,12 @@ def _get_thumbs(data): vimeo_id_regex = re.compile(r"(?:/video/)(\d+)") vimeo_id = vimeo_id_regex.findall(data)[0] # make an API call to get thumbnail URL - vimeo_request = http.request( - "GET", "https://vimeo.com/api/v2/video/" + vimeo_id + ".json" + vimeo_request = session.get( + "https://vimeo.com/api/v2/video/" + vimeo_id + ".json", + timeout=REQUEST_TIMEOUT, ) - data = json.loads(vimeo_request.data.decode("utf-8")) + vimeo_request.raise_for_status() + data = vimeo_request.json() video_thumb = data[0]["thumbnail_large"] else: # the thumbs parameter is True, but the APOD for the date is not a video, output nothing @@ -94,8 +127,7 @@ def _get_apod_chars(dt, thumbs): else: apod_url = "%sastropix.html" % BASE LOG.debug("OPENING URL:" + apod_url) - res = requests.get(apod_url) - page_text = _decode_response_text(res) + res = session.get(apod_url, timeout=REQUEST_TIMEOUT) if res.status_code == 404: return None # LOG.error(f'No APOD entry for URL: {apod_url}') @@ -108,6 +140,11 @@ def _get_apod_chars(dt, thumbs): # return default_obj_props + # surface upstream failures that persist after retries as a clear error + # instead of attempting to parse an error page + res.raise_for_status() + page_text = _decode_response_text(res) + soup = BeautifulSoup(page_text, "html.parser") LOG.debug("getting the data url") hd_data = None diff --git a/tests/apod/test_http.py b/tests/apod/test_http.py new file mode 100644 index 0000000..ecaaecf --- /dev/null +++ b/tests/apod/test_http.py @@ -0,0 +1,124 @@ +# coding= utf-8 +import unittest +from datetime import datetime +from unittest.mock import MagicMock, patch + +import requests + +from apod import utility + + +def _mock_response(status_code=200, content=b"", apparent_encoding="utf-8"): + res = MagicMock(spec=requests.Response) + res.status_code = status_code + res.content = content + res.text = content.decode("utf-8", errors="replace") + res.apparent_encoding = apparent_encoding + if status_code >= 400: + res.raise_for_status.side_effect = requests.HTTPError( + "%s Error" % status_code, response=res + ) + else: + res.raise_for_status.return_value = None + return res + + +class TestOutboundRequests(unittest.TestCase): + """Offline tests for timeout/retry hardening of requests made to the + backing APOD service. Regression tests for connection timeout issues + (issue #163).""" + + def test_apod_page_request_uses_timeout(self): + """The request for the APOD page must carry an explicit timeout so a + hung upstream connection cannot hold a worker indefinitely.""" + with patch.object(utility, "session") as mock_session: + mock_session.get.return_value = _mock_response(status_code=404) + result = utility._get_apod_chars(datetime(2020, 1, 1), thumbs=False) + + self.assertIsNone(result) + _, kwargs = mock_session.get.call_args + self.assertEqual(kwargs.get("timeout"), utility.REQUEST_TIMEOUT) + + def test_missing_page_returns_none(self): + """A 404 from the backing service must still map to None (which the + application layer turns into a 404 response).""" + with patch.object(utility, "session") as mock_session: + mock_session.get.return_value = _mock_response(status_code=404) + self.assertIsNone( + utility._get_apod_chars(datetime(2020, 1, 1), thumbs=False) + ) + + def test_persistent_upstream_error_raises(self): + """A 5xx that survives the retries must raise instead of handing an + upstream error page to the parser.""" + with patch.object(utility, "session") as mock_session: + mock_session.get.return_value = _mock_response(status_code=503) + with self.assertRaises(requests.HTTPError): + utility._get_apod_chars(datetime(2020, 1, 1), thumbs=False) + + def test_session_is_configured_to_retry_transient_failures(self): + """The shared session must retry transient upstream failures with + backoff before giving up.""" + adapter = utility.session.get_adapter(utility.BASE) + retries = adapter.max_retries + + self.assertGreaterEqual(retries.total, 1) + self.assertGreater(retries.backoff_factor, 0) + for status in (500, 502, 503, 504): + self.assertIn(status, retries.status_forcelist) + self.assertIn("GET", retries.allowed_methods) + # raise_on_status=False is load-bearing: it makes the session hand the + # final 5xx response back so raise_for_status() in _get_apod_chars is + # what surfaces the error (instead of urllib3 raising RetryError). + self.assertFalse(retries.raise_on_status) + # Retry-After must be ignored: a server-specified delay on a 429/503 + # would otherwise sleep the worker past gunicorn's timeout. + self.assertFalse(retries.respect_retry_after_header) + + def test_hung_connection_budget_fits_worker_timeout(self): + """Worst-case wall-clock for hung upstream connections must stay + below gunicorn's default 30s worker timeout, otherwise the worker is + killed mid-retry and the client sees a dropped connection.""" + retries = utility.session.get_adapter(utility.BASE).max_retries + + def backoff_sleeps(num_retries): + # urllib3 2.x sleeps 0 before the first retry, then + # backoff_factor * 2^(n-1) before retry n + return sum( + retries.backoff_factor * (2**n) for n in range(1, num_retries) + ) + + connect_attempts = min(retries.connect, retries.total) + 1 + worst_connect_hang = ( + connect_attempts * utility.CONNECT_TIMEOUT + + backoff_sleeps(connect_attempts - 1) + ) + self.assertLess(worst_connect_hang, 30) + + read_attempts = min(retries.read, retries.total) + 1 + worst_read_hang = read_attempts * ( + utility.CONNECT_TIMEOUT + utility.READ_TIMEOUT + ) + backoff_sleeps(read_attempts - 1) + self.assertLess(worst_read_hang, 30) + + def test_vimeo_thumbnail_request_uses_timeout(self): + """The Vimeo thumbnail lookup must also carry an explicit timeout.""" + vimeo_response = _mock_response() + vimeo_response.json.return_value = [ + {"thumbnail_large": "https://i.vimeocdn.com/video/foo_640.jpg"} + ] + with patch.object(utility, "session") as mock_session: + mock_session.get.return_value = vimeo_response + thumb = utility._get_thumbs("https://player.vimeo.com/video/12345") + + self.assertEqual(thumb, "https://i.vimeocdn.com/video/foo_640.jpg") + _, kwargs = mock_session.get.call_args + self.assertEqual(kwargs.get("timeout"), utility.REQUEST_TIMEOUT) + + def test_vimeo_thumbnail_upstream_error_raises(self): + """A persistent upstream error from the Vimeo API must surface as a + clear HTTPError instead of a JSON parsing failure on an error page.""" + with patch.object(utility, "session") as mock_session: + mock_session.get.return_value = _mock_response(status_code=503) + with self.assertRaises(requests.HTTPError): + utility._get_thumbs("https://player.vimeo.com/video/12345") diff --git a/tests/apod/test_service.py b/tests/apod/test_service.py index 67301ac..e11408c 100644 --- a/tests/apod/test_service.py +++ b/tests/apod/test_service.py @@ -1,4 +1,3 @@ -#!/bin/sh/python # coding= utf-8 import logging import unittest diff --git a/tests/apod/test_utility.py b/tests/apod/test_utility.py index 3edc2f0..39854f2 100644 --- a/tests/apod/test_utility.py +++ b/tests/apod/test_utility.py @@ -1,4 +1,3 @@ -#!/bin/sh/python # coding= utf-8 import logging import unittest