diff --git a/.gitignore b/.gitignore index 8ecd869..7452234 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ __pycache__/ .DS_Store .venv dist -.ruff_cache \ No newline at end of file +.ruff_cache +*.mbtiles \ No newline at end of file diff --git a/README.md b/README.md index 11e776d..c242dd6 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,27 @@ # tileget -![GitHub Release](https://img.shields.io/github/v/release/Kanahiro/tileget?label=PyPI) +![GitHub Release](https://img.shields.io/github/v/release/Kanahiro/tileget) ![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/Kanahiro/tileget/lint.yml?label=lint) Tile download utility - easily download xyz-tile data. -## installation +## Requirements + +- Python >= 3.14 + +## Installation ```sh pip install tileget ``` -## usage +## Usage -```planetext -usage: __main__.py [-h] [-e OUTPUT_DIR] [-o OUTPUT_FILE] [--extent EXTENT EXTENT EXTENT EXTENT] [--geojson GEOJSON] [--minzoom MINZOOM] [--maxzoom MAXZOOM] [--interval INTERVAL] [--overwrite] [--timeout TIMEOUT] [--tms] - tileurl +```plaintext +usage: tileget [-h] [-e OUTPUT_DIR] [-o OUTPUT_FILE] [--extent EXTENT EXTENT EXTENT EXTENT] + [--geojson GEOJSON] [--minzoom MINZOOM] [--maxzoom MAXZOOM] [--rps RPS] [--overwrite] + [--timeout TIMEOUT] [--tms] [--retries RETRIES] [--retry-delay RETRY_DELAY] + tileurl xyz-tile download tool @@ -24,28 +30,31 @@ positional arguments: options: -h, --help show this help message and exit - -e OUTPUT_DIR, --output_dir OUTPUT_DIR + -e, --output_dir OUTPUT_DIR output dir - -o OUTPUT_FILE, --output_file OUTPUT_FILE + -o, --output_file OUTPUT_FILE output mbtiles file --extent EXTENT EXTENT EXTENT EXTENT min_lon min_lat max_lon max_lat, whitespace delimited --geojson GEOJSON path to geojson file of Feature or FeatureCollection --minzoom MINZOOM default to 0 --maxzoom MAXZOOM default to 16 - --interval INTERVAL time taken after each-request, set as miliseconds in interger, default to 500 + --rps RPS requests per second, must be positive, default to 1 --overwrite overwrite existing files - --timeout TIMEOUT wait response until this value, set as miliseconds in integer, default to 5000 + --timeout TIMEOUT wait response until this value in seconds, default to 5.0 --tms if set, parse z/x/y as TMS + --retries RETRIES max retry count on error, default to 3 + --retry-delay RETRY_DELAY + base delay in seconds for exponential backoff, default to 1.0 ``` -### examples +### Examples ```sh # basic usage tileget http://path/to/tile/{z}/{x}/{y}.jpg -e output_dir --extent 141.23 40.56 142.45 43.78 tileget http://path/to/tile/{z}/{x}/{y}.jpg -o output.mbtiles --geojson input.geojson -# optional arguments -tileget http://path/to/tile/{z}/{x}/{y}.jpg -e output_dir --extent 141.23 40.56 142.45 43.78 --minzoom 0 --maxzoom 16 --interval 500 --timeout 5000 --overwrite +# with rate limiting and retry options +tileget http://path/to/tile/{z}/{x}/{y}.jpg -e output_dir --extent 141.23 40.56 142.45 43.78 --minzoom 0 --maxzoom 16 --rps 5 --retries 5 --retry-delay 2.0 --timeout 10 --overwrite ``` diff --git a/pyproject.toml b/pyproject.toml index 5a9315a..e61f2a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "tileget" -version = "0.4.3" +version = "1.0.0" description = "Tile download utility - easily download xyz-tile data" readme = "README.md" requires-python = ">= 3.14" diff --git a/tileget/__main__.py b/tileget/__main__.py index 4aa1c28..6ce1a2f 100644 --- a/tileget/__main__.py +++ b/tileget/__main__.py @@ -1,110 +1,181 @@ import asyncio import os +import random +import signal import sqlite3 +import time import httpx import tiletanic from tileget.arg import parse_arg +# ダウンロード速度を計測するためのグローバル変数 +downloaded_count = 0 +start_time = 0.0 + +# グレースフルシャットダウン用フラグ +shutdown_requested = False + + +class RateLimiter: + def __init__(self, rps: int): + self.rps = rps + self.interval = 1.0 / rps + self.last_request_time = 0.0 + self.lock = asyncio.Lock() + + async def acquire(self) -> bool: + """レートリミットを取得。シャットダウン時はFalseを返す""" + if shutdown_requested: + return False + + async with self.lock: + if shutdown_requested: + return False + + now = time.monotonic() + wait_time = self.last_request_time + self.interval - now + if wait_time > 0: + await asyncio.sleep(wait_time) + if shutdown_requested: + return False + + self.last_request_time = time.monotonic() + return True + + +def is_retryable_error(e: Exception) -> bool: + if isinstance(e, httpx.TimeoutException): + return True + if isinstance(e, httpx.HTTPStatusError): + return e.response.status_code >= 500 or e.response.status_code == 429 + return False + async def fetch_data( - client: httpx.AsyncClient, url: str, timeout: int = 5000 + client: httpx.AsyncClient, + url: str, + timeout: float, + retries: int, + retry_delay: float, ) -> bytes | None: - print("downloading: " + url) - try: - response = await client.get(url, timeout=timeout / 1000) - response.raise_for_status() - return response.content - except httpx.HTTPStatusError as e: - print(f"{e.response.status_code}: {url}") - return None - except httpx.TimeoutException: - print(f"timeout: {url}") - return None - except Exception as e: - print(f"{e}: {url}") - return None + global downloaded_count + + for attempt in range(retries + 1): + try: + response = await client.get(url, timeout=timeout) + response.raise_for_status() + downloaded_count += 1 + elapsed = time.monotonic() - start_time + speed = downloaded_count / elapsed if elapsed > 0 else 0 + print(f"{downloaded_count} tiles ({speed:.1f} tiles/s): {url}") + return response.content + except Exception as e: + if not is_retryable_error(e) or attempt == retries: + if isinstance(e, httpx.HTTPStatusError): + print(f"{e.response.status_code}: {url}") + elif isinstance(e, httpx.TimeoutException): + print(f"timeout: {url}") + else: + print(f"{e}: {url}") + return None + + delay = retry_delay * (2**attempt) + random.uniform(0, 1) + print(f"retry {attempt + 1}/{retries} after {delay:.1f}s: {url}") + await asyncio.sleep(delay) + if shutdown_requested: + return None + + return None async def download_dir( client: httpx.AsyncClient, - semaphore: asyncio.Semaphore, + rate_limiter: RateLimiter, tile: tiletanic.Tile, tileurl: str, output_path: str, - timeout: int = 5000, - overwrite: bool = False, + timeout: float, + overwrite: bool, + retries: int, + retry_delay: float, ): - async with semaphore: - ext = os.path.splitext(tileurl.split("?")[0])[-1] + ext = os.path.splitext(tileurl.split("?")[0])[-1] - write_dir = os.path.join(output_path, str(tile.z), str(tile.x)) - write_filepath = os.path.join(write_dir, str(tile.y) + ext) + write_dir = os.path.join(output_path, str(tile.z), str(tile.x)) + write_filepath = os.path.join(write_dir, str(tile.y) + ext) - if os.path.exists(write_filepath) and not overwrite: - return + if os.path.exists(write_filepath) and not overwrite: + return - url = ( - tileurl.replace(r"{x}", str(tile.x)) - .replace(r"{y}", str(tile.y)) - .replace(r"{z}", str(tile.z)) - ) + if not await rate_limiter.acquire(): + return + + url = ( + tileurl.replace(r"{x}", str(tile.x)) + .replace(r"{y}", str(tile.y)) + .replace(r"{z}", str(tile.z)) + ) - data = await fetch_data(client, url, timeout) - if data is None: - return + data = await fetch_data(client, url, timeout, retries, retry_delay) + if data is None: + return - os.makedirs(write_dir, exist_ok=True) - with open(write_filepath, mode="wb") as f: - f.write(data) + os.makedirs(write_dir, exist_ok=True) + with open(write_filepath, mode="wb") as f: + f.write(data) async def download_mbtiles( client: httpx.AsyncClient, - semaphore: asyncio.Semaphore, + rate_limiter: RateLimiter, conn: sqlite3.Connection, tile: tiletanic.Tile, tileurl: str, - timeout: int = 5000, - overwrite: bool = False, - tms: bool = False, + timeout: float, + overwrite: bool, + tms: bool, + retries: int, + retry_delay: float, ): - async with semaphore: - if tms: - ty = tile.y - else: - ty = (1 << tile.z) - 1 - tile.y + if tms: + ty = tile.y + else: + ty = (1 << tile.z) - 1 - tile.y - c = conn.cursor() - c.execute( - "SELECT tile_data FROM tiles WHERE zoom_level = ? AND tile_column = ? AND tile_row = ?", - (tile.z, tile.x, ty), - ) - if c.fetchone() is not None and not overwrite: - return + c = conn.cursor() + c.execute( + "SELECT tile_data FROM tiles WHERE zoom_level = ? AND tile_column = ? AND tile_row = ?", + (tile.z, tile.x, ty), + ) + if c.fetchone() is not None and not overwrite: + return - url = ( - tileurl.replace(r"{x}", str(tile.x)) - .replace(r"{y}", str(tile.y)) - .replace(r"{z}", str(tile.z)) - ) + if not await rate_limiter.acquire(): + return - data = await fetch_data(client, url, timeout) - if data is None: - return + url = ( + tileurl.replace(r"{x}", str(tile.x)) + .replace(r"{y}", str(tile.y)) + .replace(r"{z}", str(tile.z)) + ) - if overwrite: - c.execute( - "DELETE FROM tiles WHERE zoom_level = ? AND tile_column = ? AND tile_row = ?", - (tile.z, tile.x, ty), - ) + data = await fetch_data(client, url, timeout, retries, retry_delay) + if data is None: + return + if overwrite: c.execute( - "INSERT INTO tiles (zoom_level, tile_column, tile_row, tile_data) VALUES (?, ?, ?, ?)", - (tile.z, tile.x, ty, data), + "DELETE FROM tiles WHERE zoom_level = ? AND tile_column = ? AND tile_row = ?", + (tile.z, tile.x, ty), ) - conn.commit() + + c.execute( + "INSERT INTO tiles (zoom_level, tile_column, tile_row, tile_data) VALUES (?, ?, ?, ?)", + (tile.z, tile.x, ty, data), + ) + conn.commit() def create_mbtiles(output_file: str): @@ -141,10 +212,23 @@ def create_mbtiles(output_file: str): async def run(): + global start_time, shutdown_requested params = parse_arg() + start_time = time.monotonic() - concurrency = max(1, 1000 // params.interval) - semaphore = asyncio.Semaphore(concurrency) + # SIGINTハンドラを設定 + loop = asyncio.get_running_loop() + + def handle_sigint(): + global shutdown_requested + if not shutdown_requested: + shutdown_requested = True + print("\nShutdown requested. Waiting for running tasks to complete...") + + loop.add_signal_handler(signal.SIGINT, handle_sigint) + loop.add_signal_handler(signal.SIGTERM, handle_sigint) + + rate_limiter = RateLimiter(params.rps) conn = None if params.mode == "mbtiles": @@ -183,45 +267,64 @@ async def run(): async with httpx.AsyncClient() as client: for zoom in range(params.minzoom, params.maxzoom + 1): - tiles = list( - tiletanic.tilecover.cover_geometry(tilescheme, params.geometry, zoom) + if shutdown_requested: + break + + tiles = tiletanic.tilecover.cover_geometry( + tilescheme, params.geometry, zoom ) - if params.mode == "dir": - tasks = [ - download_dir( - client, - semaphore, - tile, - params.tileurl, - params.output_path, - params.timeout, - params.overwrite, + # TaskGroupの代わりに手動でタスクを管理 + pending_tasks: set[asyncio.Task] = set() + + for tile in tiles: + if shutdown_requested: + break + + if params.mode == "dir": + task = asyncio.create_task( + download_dir( + client, + rate_limiter, + tile, + params.tileurl, + params.output_path, + params.timeout, + params.overwrite, + params.retries, + params.retry_delay, + ) ) - for tile in tiles - ] - else: - assert conn is not None - tasks = [ - download_mbtiles( - client, - semaphore, - conn, - tile, - params.tileurl, - params.timeout, - params.overwrite, - params.tms, + else: + assert conn is not None + task = asyncio.create_task( + download_mbtiles( + client, + rate_limiter, + conn, + tile, + params.tileurl, + params.timeout, + params.overwrite, + params.tms, + params.retries, + params.retry_delay, + ) ) - for tile in tiles - ] + pending_tasks.add(task) + task.add_done_callback(pending_tasks.discard) - await asyncio.gather(*tasks) + # 残っているタスクの完了を待つ + if pending_tasks: + await asyncio.gather(*pending_tasks, return_exceptions=True) if conn is not None: conn.close() - print("finished") + if shutdown_requested: + print("Shutdown complete.") + else: + print("finished") def main(): diff --git a/tileget/arg.py b/tileget/arg.py index a6e3a1d..02212a2 100644 --- a/tileget/arg.py +++ b/tileget/arg.py @@ -13,12 +13,14 @@ class RunParams: mode: Literal["dir", "mbtiles"] output_path: str geometry: shapely.geometry.base.BaseGeometry - minzoom: int = 0 - maxzoom: int = 16 - interval: int = 1000 - overwrite: bool = False - timeout: int = 5000 - tms: bool = False + minzoom: int + maxzoom: int + rps: int + overwrite: bool + timeout: float + tms: bool + retries: int + retry_delay: float def parse_arg() -> RunParams: @@ -37,22 +39,40 @@ def parse_arg() -> RunParams: ) parser.add_argument("--minzoom", default=0, type=int, help="default to 0") parser.add_argument("--maxzoom", default=16, type=int, help="default to 16") + def positive_int(value: str) -> int: + ivalue = int(value) + if ivalue <= 0: + raise argparse.ArgumentTypeError("must be a positive integer") + return ivalue + parser.add_argument( - "--interval", - default=500, - type=int, - help="time taken after each-request, set as miliseconds in interger, default to 500", + "--rps", + default=1, + type=positive_int, + help="requests per second, must be positive, default to 1", ) parser.add_argument( "--overwrite", help="overwrite existing files", action="store_true" ) parser.add_argument( "--timeout", - default=5000, - type=int, - help="wait response until this value, set as miliseconds in integer, default to 5000", + default=5.0, + type=float, + help="wait response until this value in seconds, default to 5.0", ) parser.add_argument("--tms", help="if set, parse z/x/y as TMS", action="store_true") + parser.add_argument( + "--retries", + default=3, + type=int, + help="max retry count on error, default to 3", + ) + parser.add_argument( + "--retry-delay", + default=1.0, + type=float, + help="base delay in seconds for exponential backoff, default to 1.0", + ) args = parser.parse_args() if args.output_dir is None and args.output_file is None: @@ -105,10 +125,12 @@ def parse_arg() -> RunParams: geometry=geom_3857, minzoom=args.minzoom, maxzoom=args.maxzoom, - interval=args.interval, + rps=args.rps, overwrite=args.overwrite, timeout=args.timeout, tms=args.tms, + retries=args.retries, + retry_delay=args.retry_delay, ) return params diff --git a/uv.lock b/uv.lock index baa628e..633a92b 100644 --- a/uv.lock +++ b/uv.lock @@ -212,7 +212,7 @@ wheels = [ [[package]] name = "tileget" -version = "0.4.3" +version = "1.0.0" source = { editable = "." } dependencies = [ { name = "httpx" },