diff --git a/tools/links/links.config.php b/tools/links/links.config.php
new file mode 100644
index 0000000000..366bc72a1e
--- /dev/null
+++ b/tools/links/links.config.php
@@ -0,0 +1,202 @@
+includeName('*.md')
+        ->excludeName('new_in_doc.md')//TMP
+        ->excludeWholeName('./docs/release_notes/*')//TMP
+        ->excludeWholeName('./docs/snippets/*')//TMP
+        ->excludeWholeName('./docs/update_and_migration/*')//TMP
+
+        //->includeName('*.html') // Memory issues
+        ->includeWholeName('./**/php_api_reference/index.html') // Should be enough
+
+        ->find(),
+);
+
+// PNG/JPG files found under ./docs, sorted by value (asort() keeps the original keys).
+$resourceFiles = call_user_func(function (array $a): array {
+    asort($a);
+    return $a;
+},
+    // Images
+    (new Finder('./docs'))
+        ->includeName('*.png')
+        ->includeName('*.jpg')
+        ->excludeWholeName('./docs/release_notes/img/*')//TMP
+        ->excludeWholeName('./docs/update_and_migration/img/*')//TMP
+        ->excludeWholeName('./docs/api/php_api/php_api_reference/images/*')//TMP
+        ->find(),
+);
+
+// Project-specific exclusion tests, merged with the UrlTester defaults.
+// Three groups of closures: 'url' (URL + file), 'location' (URL + redirect target + file), 'fragment' (URL + file).
+// NOTE(review): presumably an item is skipped as soon as one closure of its group returns true — confirm in UrlTester.
+$exclusionTests = array_merge_recursive(UrlTester::getDefaultExclusionTests(), [
+    'url' => [
+        function (string $url, ?string $file = null): bool {
+            // docs/index.md content is not Markdown but HTML with server URLs
+            return 'docs/index.md' === $file && str_starts_with($url, 'docs/');
+        },
+        function (string $url, ?string $file = null): bool {
+            // ibexa.co APIs needing authentication, namespaces, commercial aliases, etc.
+            return str_starts_with($url, 'https://updates.ibexa.co') // 401
+                || str_starts_with($url, 'https://flex.ibexa.co') // 404
+                || str_starts_with($url, 'https://support.ibexa.co') // 302 → /login
+                || str_starts_with($url, 'https://connect.ibexa.co') // 302 → https://ibexa.integromat.celonis.com/
+                || str_starts_with($url, 'http://ibexa.co/namespaces/') // 301
+                || str_starts_with($url, 'http://ibexa.co/xmlns/') // 301
+                || str_starts_with($url, 'http://ez.no/namespaces/') //301
+                //|| str_starts_with($url, 'http://ez.no/xmlns/') //301
+                || str_starts_with($url, 'https://api.cloud.ibexa.co') // 301, PLATFORMSH_CLI_API_URL
+                //|| str_starts_with($url, 'https://admin.perso.ibexa.co/api/') // 400
+                || str_starts_with($url, 'https://admin.perso.ibexa.co/') // 404
+                || str_starts_with($url, 'https://event.perso.ibexa.co/api/') // 400
+                || str_starts_with($url, 'https://event.perso.ibexa.co/ebl/') // 404
+                || str_starts_with($url, 'https://import.perso.ibexa.co/api/') // 400
+                || str_starts_with($url, 'https://reco.perso.ibexa.co') // 403
+                || str_starts_with($url, 'https://admin.yoochoose.net') // 400
+                || str_starts_with($url, 'https://event.yoochoose.net/api/') // 400
+                || str_starts_with($url, 'https://event.yoochoose.net/ebl/') // 404
+                || str_starts_with($url, 'https://event.yoochoose.net/') // 404
+                || str_starts_with($url, 'https://reco.yoochoose.net') // 403
+                || str_starts_with($url, 'https://tracker.ibexa.co/') // 999 Could not resolve host
+            ;
+        },
+        function (string $url, ?string $file = null): bool {
+            // Third parties APIs, namespaces, etc.
+            return str_starts_with($url, 'https://api.fastly.com') // 301 or 403
+                || str_starts_with($url, 'https://unsplash.com') // 405
+                || str_starts_with($url, 'http://docbook.org/ns/') // 301
+                || str_starts_with($url, 'http://schema.org/ListItem') // 301 → https
+                || str_starts_with($url, 'http://www.w3.org/1999/xlink') // 301 → https
+                || str_starts_with($url, 'https://www.google.com/recaptcha/admin') // 302 → https://accounts.google.com/ServiceLogin
+            ;
+        },
+        function (string $url, ?string $file = null): bool {
+            // Local hosts and example IP addresses; dots are escaped so they only match literal dots
+            return (bool)preg_match('@(https?:)?//([a-z]+\.)?(localhost|127\.0\.0\.1|123\.456\.789\.0)(:[0-9]+)?(/.*|$)@', $url);
+        },
+        function (string $url, ?string $file = null): bool {
+            // Fake, placeholders, local servers, etc.
+            return str_contains($url, 'foobar.com')
+                || str_contains($url, 'mydomain.com')
+                || str_contains($url, '//my_site.com')
+                || str_contains($url, '//address.of/')
+                || str_contains($url, '//some/file/here')
+                || str_contains($url, '//mydoc.pdf')
+                || str_contains($url, 'var/ezdemo_site/')
+                || str_contains($url, '//server_uri')
+                || str_contains($url, '//FRA_server_uri')
+                || str_contains($url, '//user:password@host')
+                || str_contains($url, '//user:pass@localhost')
+                || str_contains($url, '//elasticsearch:9200')
+                || str_contains($url, '//solr:8983')
+                || str_contains($url, '//varnish:80')
+                || str_contains($url, '//my.varnish.server')
+                || str_contains($url, '//myuser:mypasswd@my.varnish.server')
+                || str_contains($url, 'platformsh.site')
+            ;
+        },
+        function (string $url, ?string $file = null): bool {
+            // Asset paths and Twig asset() calls
+            return str_starts_with($url, '/assets/')
+                || str_contains($url, '{{ asset(');
+        },
+        function (string $url, ?string $file = null): bool {
+            // javascript: pseudo-URLs
+            return str_contains($url, 'javascript:');
+        },
+        function (string $url, ?string $file = null): bool {
+            // Twig expressions used in place of a URL
+            return str_contains($url, '{{ path(')
+                //|| str_contains($url, '{{ ez_path(')
+                || str_contains($url, '{{ ibexa_path(')
+                || str_contains($url, '{{ ibexa_url(')
+                //|| str_contains($url, "|e('html_attr')")
+                || str_contains($url, '{{ image_uri }}')
+                || str_contains($url, '{{ ibexa_checkout_step_path(')
+                || str_contains($url, '{{ ibexa_checkout_step_url(')
+            ;
+        },
+        function (string $url, ?string $file = null): bool {
+            // $file is nullable: check it before handing it to str_ends_with()
+            return null !== $file
+                && str_ends_with($file, '/rest_api_authentication.md')
+                && str_ends_with($url, 'web+ez:DELETE /content/locations/1/2');
+        },
+        function (string $url, ?string $file = null): bool {
+            return null !== $file
+                && str_ends_with($file, '/file_url_handling.md')
+                && (str_contains($url, 'http://`') || str_contains($url, 'ftp://`'));
+        },
+        function (string $url, ?string $file = null): bool {
+            if (null !== $file && str_ends_with($file, '/php_api_reference/index.html')) {
+                return str_contains($url, '/namespaces/symfony-contracts') || str_contains($url, '/classes/Symfony-Contracts');
+            }
+            return false;
+        },
+
+    ],
+    'location' => [
+        function (string $url, string $location, ?string $file = null): bool {
+            return str_starts_with($url, 'https://issues.ibexa.co/') && str_starts_with($location, 'https://issues.ibexa.co/login.jsp');
+        },
+        function (string $url, string $location, ?string $file = null): bool {
+            return str_starts_with($url, 'https://symfony.com/doc/7.3/') && str_starts_with($location, 'https://symfony.com/doc/current/');
+        },
+        function (string $url, string $location, ?string $file = null): bool {
+            // Short youtu.be link whose redirect target carries the same video id in `?v=`
+            if (!str_starts_with($url, 'https://youtu.be/')) {
+                return false;
+            }
+            $videoId = explode('/', $url)[3] ?? '';
+            // The target may have no `?v=` part at all: do not read a missing index
+            $locationVideo = explode('?v=', $location)[1] ?? null;
+
+            return $videoId . '&feature=youtu.be' === $locationVideo;
+        },
+        function (string $url, string $location, ?string $file = null): bool {
+            return str_starts_with($url, 'https://www.facebook.com/') && str_starts_with($location, 'https://www.facebook.com/unsupportedbrowser');
+        },
+        function (string $url, string $location, ?string $file = null): bool {
+            return 'https://www.json.org/' === $url && 'https://www.JSON.org/json-en.html' === $location;
+        },
+        function (string $url, string $location, ?string $file = null): bool {
+            return $url === 'https://console.aws.amazon.com/iam/home#/users'
+                && 1 === preg_match('@https://[a-z0-9-]+\.console\.aws\.amazon\.com/iam/home#/users@', $location);
+        },
+        function (string $url, string $location, ?string $file = null): bool {
+            return 'https://www.paypal.com/bizsignup/#/singlePageSignup' === $url && str_starts_with($location, 'https://www.paypal.com/unifiedonboarding/entry');
+        },
+    ],
+    'fragment' => [
+        /*function (string $url, ?string $file = null): bool {
+            return str_ends_with($file, '.md') && (
+                // ## Commerce [[% include 'snippets/commerce_badge.md' %]]
+                str_ends_with($url, '/permission_use_cases.md#commerce')
+                // ### Ensure proper Captcha behavior [[% include 'snippets/experience_badge.md' %]] [[% include 'snippets/commerce_badge.md' %]]
+                || str_ends_with($url, '/reverse_proxy.md#ensure-proper-captcha-behavior')
+            );
+        },*/
+        function (string $url, ?string $file = null): bool {
+            return str_starts_with($url, 'https://classic.yarnpkg.com/en/docs/')
+                || str_starts_with($url, 'https://ddev.readthedocs.io/');
+        },
+        function (string $url, ?string $file = null): bool {
+            return 'https://www.paypal.com/bizsignup/#/singlePageSignup' === $url;
+        }
+    ],
+]);
+
+// Closures selecting URLs by prefix.
+// NOTE(review): the variable name suggests matching URLs are requested through curl — confirm where $curlUsageTests is consumed.
+$curlUsageTests = [
+    function (string $url, ?string $file = null): bool {
+        return str_starts_with($url, 'https://docs.aws.amazon.com/');
+    },
+    function (string $url, ?string $file = null): bool {
+        return str_starts_with($url, 'https://semver.org/');
+    },
+    function (string 
$url, ?string $file = null): bool {
+        return str_starts_with($url, 'https://cdn.jsdelivr.net/npm/');
+    },
+];
+
+// yaml_parse_file() returns false on failure: fall back to an empty configuration
+// so the lookups below work on an array instead of a boolean.
+$mkdocs = yaml_parse_file('mkdocs.yml');
+if (!is_array($mkdocs)) {
+    $mkdocs = [];
+}
+
+// Map each `[[= key =]]` placeholder to the string value of the same key in the `extra` section of mkdocs.yml.
+$replacements = [];
+$extra = $mkdocs['extra'] ?? [];
+foreach (is_array($extra) ? $extra : [] as $key => $value) {
+    if (is_string($value)) {
+        $replacements['[[= ' . $key . ' =]]'] = $value;
+    }
+}
+
+$absoluteLinks = $mkdocs['validation']['links']['absolute_links'] ?? $mkdocs['validation']['absolute_links'] ?? 'info';
+if ('relative_to_docs' === $absoluteLinks) {
+    //TODO
+}
+
+$find = './docs';
diff --git a/tools/links/links.php b/tools/links/links.php
new file mode 100644
index 0000000000..ae320ca0dc
--- /dev/null
+++ b/tools/links/links.php
@@ -0,0 +1,1817 @@
+url = $url;
+        $this->text = $text;
+        $this->file = $file;
+        $this->line = (int)$line;
+        $this->replacements = $replacements;
+        $this->find = $find;
+        $this->base = $base;
+        if ($test) {
+            $this->test();
+        }
+    }
+
+    /**
+     * Test/check the URL and fill its related properties (code, headers, etc.).
+     *
+     * The result is kept: calling this method again does nothing unless `$cache` is `false`.
+     *
+     * @param bool $testLocations If it's a redirection (through a `location` header), test the redirect target
+     * @param bool $testFragment If the eventual fragment/hash/anchor part should be tested
+     * @param bool $useCurl Passed through to {@see TestableUrl::testUrl()}
+     * @param bool $cache Reuse the previous result if already tested; pass `false` to test again
+     * @return $this Returns itself for chain like $this->test()->getCode()
+     */
+    public function test(bool $testLocations = true, bool $testFragment = true, bool $useCurl = false, bool $cache = true): self
+    {
+        if (!$this->isTested() || !$cache) {
+            $test = self::testUrl($this->getSolvedUrl(), $this->isExternal(), $testFragment, $useCurl);
+            $this->headers = $test['headers'];
+            $this->code = $test['code'];
+            // The redirect target becomes its own TestableUrl, attached to the same file and line.
+            // NOTE(review): the last constructor argument looks like the `$test` flag, so the target is tested on creation only when $testLocations is true — confirm against the constructor signature.
+            $this->location = null === $test['location'] ? null : new TestableUrl($test['location'], null, $this->getFile(), $this->getLine(), null, false, null, $testLocations);
+            $this->fragmentFound = $test['fragment_found'];
+            $this->tested = true;
+        }
+
+        return $this;
+    }
+
+    /**
+     * Has the URL been tested/checked. 
+     *
+     * @return bool
+     */
+    public function isTested(): bool
+    {
+        return $this->tested;
+    }
+
+    /**
+     * Get the raw URL.
+     *
+     * @return string
+     */
+    public function getUrl(): string
+    {
+        return $this->url;
+    }
+
+    /**
+     * Execute URL transformations like replacements.
+     *
+     * The result is computed once and kept in `$this->transformedUrl`.
+     */
+    public function getTransformedUrl(): string
+    {
+        if (null === $this->transformedUrl) {
+            $url = $this->getUrl();
+            if (is_array($this->replacements)) {
+                $url = str_replace(array_keys($this->replacements), array_values($this->replacements), $url);
+            }
+            $this->transformedUrl = $url;
+        }
+
+        return $this->transformedUrl;
+    }
+
+    /**
+     * Build the path leading from the directory of `$sourcePath` to `$targetPath`.
+     *
+     * @param string $sourcePath Path of the file the link starts from
+     * @param string $targetPath Path of the file the link points to
+     * @return string `$targetPath` expressed relatively to the directory of `$sourcePath`
+     */
+    public static function getRelativePath($sourcePath, $targetPath): string
+    {
+        $sourcePathInfo = pathinfo($sourcePath);
+        $targetPathInfo = pathinfo($targetPath);
+        $sourceDir = '.' === $sourcePathInfo['dirname'] ? [] : explode('/', $sourcePathInfo['dirname']);
+        $targetDir = '.' === $targetPathInfo['dirname'] ? [] : explode('/', $targetPathInfo['dirname']);
+        while (!empty($sourceDir) && !empty($targetDir) && $sourceDir[0] === $targetDir[0]) {
+            // Remove common path
+            array_shift($sourceDir);
+            array_shift($targetDir);
+        }
+        while (!empty($sourceDir)) {
+            // One `..` for each remaining source directory
+            array_shift($sourceDir);
+            array_unshift($targetDir, '..');
+        }
+
+        return (empty($targetDir) ? '' : implode('/', $targetDir) . '/') . $targetPathInfo['basename'];
+    }
+
+    /**
+     * Resolve `$targetPath` against the location of `$sourcePath`.
+     *
+     * - A target starting with `/` is relative to the root: it is prefixed with the scheme and host of `$sourcePath` when this one has them, returned as is otherwise.
+     * - Any other target is appended to the directory of `$sourcePath`, then `dir/..` pairs are collapsed.
+     *
+     * @param string $sourcePath Path or URL of the document containing the link
+     * @param string $targetPath Link target as written in the document
+     */
+    public static function solveRelativePath(string $sourcePath, string $targetPath): string
+    {
+        if (str_starts_with($targetPath, '/')) { // Relative to root (also safe with an empty target)
+            // `scheme` and `host` are read from $matches below, so the groups have to carry these names
+            if (preg_match('@^(?P<scheme>[^:]+:)?//(?P<host>[^/]+)@', $sourcePath, $matches)) {
+                $targetPath = "{$matches['scheme']}//{$matches['host']}$targetPath";
+            }
+            return $targetPath;
+        }
+
+        $targetPath = preg_replace('@^\./@', '', $targetPath);
+
+        if ('/' === substr($sourcePath, -1)) {
+            // pathinfo() would take the last directory for a file name: add a dummy one
+            $sourcePath .= 'tmp.tmp';
+        }
+
+        $sourcePathInfo = pathinfo($sourcePath);
+        if ('.' !== $sourcePathInfo['dirname']) {
+            // Add common path
+            $targetPath = "{$sourcePathInfo['dirname']}/$targetPath";
+            // Collapse `dir/..` pairs
+            $targetPathInfo = pathinfo($targetPath);
+            $targetDir = explode('/', $targetPathInfo['dirname']);
+            for ($i = 0; $i < count($targetDir); $i++) {
+                // A `..` preceded by another `..` has nothing to cancel and must be kept
+                if ($i > 0 && '..' === $targetDir[$i] && '..' !== $targetDir[$i - 1]) {
+                    array_splice($targetDir, $i - 1, 2);
+                    $i -= 2;
+                }
+            }
+            $targetPath = (empty($targetDir) ? '' : implode('/', $targetDir) . '/') . $targetPathInfo['basename'];
+        }
+
+        return $targetPath;
+    }
+
+    /**
+     * Get testable URI.
+     *
+     * If the file where the URL has been found is known,
+     * solve the relative path of internal URL
+     * or append file URL to fragment.
+     *
+     * The result is computed once and kept in `$this->solvedUrl`.
+     *
+     * @return string
+     * @throws \RuntimeException When the solved URL contains `http://` or `https://` elsewhere than at its start
+     */
+    public function getSolvedUrl(): string
+    {
+        if (null === $this->solvedUrl) {
+            $url = $this->getTransformedUrl();
+            if ($this->isInternal() && !empty($this->base)) {
+                $url = $this->base . (str_ends_with($this->base, '/') ? '' : '/') . $url;
+            }
+            // parse_url() can return null or false: only a string goes to is_dir()
+            $path = parse_url($url, PHP_URL_PATH);
+            if ($this->isInternal() && !$this->isFragment() && is_string($path) && is_dir($path)) {
+                // A directory targets its index.html; the file name goes before the fragment, not after it
+                $urlWithoutFragment = self::getUrlWithoutFragment($url);
+                $fragmentPart = substr($url, strlen($urlWithoutFragment));
+                $url = $urlWithoutFragment . (str_ends_with($urlWithoutFragment, '/') ? '' : '/') . 'index.html' . $fragmentPart;
+            }
+            if ($this->isFragment()) {
+                return $this->solvedUrl = $this->getFile() . $url;
+            } else if ($this->isInternal() && $this->hasFile() && !$this->find) {
+                // Keep the result like the other branches do, instead of solving again on each call
+                return $this->solvedUrl = self::solveRelativePath($this->getFile(), $url);
+            } else if ($this->isInternal() && $this->find) {
+                if (str_starts_with($url, '/')) {
+                    $findPrefix = is_string($this->find) ? $this->find : '*';
+                } else {
+                    $findPrefix = is_string($this->find) ? $this->find.'/*/' : '*/';
+                }
+                $urlWithoutFragment = self::getUrlWithoutFragment($url);
+                $candidates = (new Finder('.'))->includeWholeName("{$findPrefix}{$urlWithoutFragment}")->find();
+                if (1 === count($candidates)) {
+                    // Exactly one file matches: it is the target
+                    $url = $candidates[0] . ($this->hasFragment() ? '#' . $this->getFragment() : '');
+                } else if ($this->hasFile()) {
+                    // None or several matches: treat the URL as relative to the current file
+                    $url = self::solveRelativePath($this->getFile(), $url);
+                }
+            }
+            if (!str_starts_with($url, 'https://') && str_contains($url, 'https://')
+                || !str_starts_with($url, 'http://') && str_contains($url, 'http://')) {
+                throw new \RuntimeException("Malformed URL: {$url}");
+            }
+            $this->solvedUrl = $url;
+        }
+
+        return $this->solvedUrl;
+    }
+
+
+    public function hasText(): bool
+    {
+        return null !== $this->text;
+    }
+
+    public function getText(): ?string
+    {
+        return $this->text;
+    }
+
+    /** Raw URL, followed by the link text and by the raw URL of the redirect target when there are some. */
+    public function __toString(): string
+    {
+        return $this->getUrl() . ($this->hasText() ? " “{$this->getText()}”" : '') . ($this->hasLocation() ? " → {$this->getLocation()->getUrl()}" : '');
+    }
+
+    public function hasFile(): bool
+    {
+        return !empty($this->file);
+    }
+
+    public function getFile(): ?string
+    {
+        return $this->file;
+    }
+
+    public function getLine(): ?int
+    {
+        return (int)$this->line;
+    }
+
+    /** Tell if the given URL matches `EXTERNAL_PATTERN`. */
+    public static function isExternalUrl(string $url): bool
+    {
+        return (bool)preg_match(self::PATTERN_DELIMITER . self::EXTERNAL_PATTERN . self::PATTERN_DELIMITER, $url);
+    }
+
+    /** Tell if the transformed URL is external; the answer is computed once. */
+    public function isExternal(): bool
+    {
+        if (null === $this->external) {
+            $this->external = self::isExternalUrl($this->getTransformedUrl());
+        }
+
+        return $this->external;
+    }
+
+    public function isInternal(): bool
+    {
+        return !$this->isExternal();
+    }
+
+    /** Code kept by testUrl() when no HTTP status could be read. */
+    public const NOT_TESTABLE_CODE = 999;
+
+    /**
+     * Test/check a URL and return result data. 
+     *
+     * If the URL is external, return the response HTTP headers (as an array), the HTTP status code as an integer, the eventual `location` HTTP header used for redirection, if there is a fragment to test, if that fragment has been found in the content available at the URL
+     * If the URL is internal, return the code 200 if the target exists, 404 if not, if there is a fragment to test, if that fragment has been found in the content available at the URL
+     * @param string $url The URL itself
+     * @param bool|null $external If it's an external URL (`true`) or not (`false`), if `null`, the system try to guess it using {@see \TestableUrl::isExternalUrl()}
+     * @param bool $testFragment If the eventual fragment/hash/anchor part should be tested
+     * @param bool $useCurl Request headers and body through the `curl` command instead of PHP stream functions
+     * @param int $retryCount Number of retries on time out
+     * @param int $retryDelay Delay in seconds before retrying
+     * @param int $tryNumber Try number (first try is numbered 1)
+     * @return array ['headers' => null|array, 'code' => int, 'location' => null|string, 'fragment_found' => null|bool]
+     */
+    public static function testUrl(string $url, ?bool $external = null, bool $testFragment = true, bool $useCurl = false, int $retryCount = 1, int $retryDelay = 300, int $tryNumber = 1): array
+    {
+        $headers = null;
+        $code = self::NOT_TESTABLE_CODE;
+        $location = null;
+        $fragmentFound = null;
+        $isPlainText = false;
+
+        if (null === $external) {
+            $external = self::isExternalUrl($url);
+        }
+
+        if ($external) {
+            $defaultScheme = self::DEFAULT_SCHEME;
+            // Protocol-relative URL (`//host/path`) gets the default scheme
+            $headers = self::requestHeaders('//' === substr($url, 0, 2) ? "$defaultScheme:$url" : $url, $useCurl);
+            if ($headers && count($headers)) {
+                if (strlen($headers[0])) {
+                    // Status line is like `HTTP/1.1 200 OK`; keep 999 if it has no second part
+                    $firstLinePart = explode(' ', $headers[0]);
+                    $code = (int)($firstLinePart[1] ?? self::NOT_TESTABLE_CODE);
+                }
+                foreach ($headers as $header) {
+                    if (str_starts_with(strtolower($header), 'content-type: ')) {
+                        $value = trim(explode(': ', $header, 2)[1]);
+                        $isPlainText = self::isPlainTextMimeType($value);
+                    }
+                }
+            }
+        } else {
+            if (str_starts_with($url, '/')) {
+                // Root-relative internal URL: look for it from the current directory
+                return self::testUrl(".$url", $external, $testFragment, $useCurl, $retryCount, $retryDelay, $tryNumber);
+                //TODO: || return self::testUrl("$docRoot$url", $external, $testFragment, $useCurl, $retryCount, $retryDelay, $tryNumber);
+            }
+            $code = file_exists(parse_url($url, PHP_URL_PATH)/* No query (?) nor fragment (#) */) ? 200 : 404;
+            $extension = pathinfo(TestableUrl::getUrlWithoutFragment($url), PATHINFO_EXTENSION);
+            if (!empty($extension)) {
+                $isPlainText = self::isPlainTextExtension($extension);
+            }
+        }
+
+        switch ($code) {
+            case 200: // OK
+                if ($isPlainText) {
+                    // The body is needed to detect a soft redirect (external) or to look for the fragment
+                    $contents = ($external || ($testFragment && self::isUrlWithFragment($url))) ? self::requestBody(self::getUrlWithoutFragment($url), $useCurl) : '';
+                    if (false === $contents) {
+                        //TODO
+                        break;
+                    }
+                    // NOTE(review): reconstructed pattern — the original one lost its markup and was not a valid regex anymore;
+                    // only the `url` group name is certain (read below), confirm the rest against the upstream file.
+                    $refreshTagPattern = '@<meta\s+http-equiv="refresh"\s+content="[^"]*?url=(?P<url>[^"]+)"@i';
+                    if ($external && preg_match($refreshTagPattern, $contents, $matches)) { // Soft redirect
+                        $location = preg_match('@^https?://@', $matches['url']) ? $matches['url'] : self::solveRelativePath(self::getUrlWithoutFragment($url), $matches['url']);
+                        if (self::isUrlWithFragment($url)) {
+                            $location .= '#' . self::getUrlFragment($url);
+                        }
+                    } elseif ($testFragment && self::isUrlWithFragment($url)) {
+                        $fragment = self::getUrlFragment($url);
+                        // The fragment comes from the tested document: quote it before using it in a pattern
+                        $quotedFragment = preg_quote($fragment, '@');
+                        $fragmentFound = $contents && 1 === preg_match("@(id|name)=\"{$quotedFragment}\"@", $contents);
+                        if (!$fragmentFound && !self::isExternalUrl($url)) {
+                            if ('md' === pathinfo(TestableUrl::getUrlWithoutFragment($url), PATHINFO_EXTENSION)) {
+                                //TODO: MarkDown fragment search pattern should be a config.
+                                //$pattern = '@^#+\W*' . str_replace('-', '\W+', $fragment) . '\W*$@mi';
+                                // Each `-` of the anchor stands for a run of non-word characters in the heading;
+                                // split first, then quote each part (preg_quote() would escape the `-` too)
+                                $headingPattern = implode('\W+', array_map(
+                                    static fn (string $part): string => preg_quote($part, '@'),
+                                    explode('-', $fragment)
+                                ));
+                                $pattern = '@^ *#+\W*' . $headingPattern . '\W*( \[\[% include \'.+_badge.md\' %\]\])*$@mi';
+                                $fragmentFound = (bool)preg_match($pattern, $contents);
+                                //TODO: Alternatively, Markdown headers could extracted, converted to anchors, and then compared
+                            }
+                        }
+                    }
+                }
+                break;
+            case 301: // Moved Permanently
+            case 302: // Found
+            case 303: // See Other
+            case 307: // Temporary Redirect
+            case 308: // Permanent Redirect
+                foreach ($headers as $header) {
+                    if (str_starts_with(strtolower($header), 'location: ')) {
+                        // Case-insensitive like the test above, and the group is named as it is read below
+                        if (1 === preg_match('/^location: (?P<location>.*)$/i', $header, $matches)) {
+                            $parsedUrl = parse_url($url);
+                            $parsedLocation = parse_url($matches['location']);
+                            if (!is_array($parsedUrl) || !is_array($parsedLocation)) {
+                                // Seriously malformed URL: nothing to build a location from
+                                continue;
+                            }
+                            // Parts missing from the location (relative redirect) are taken from the requested URL
+                            foreach (array_keys($parsedUrl) as $key) {
+                                if (!array_key_exists($key, $parsedLocation)) {
+                                    $parsedLocation[$key] = $parsedUrl[$key];
+                                }
+                            }
+                            if (array_key_exists('query', $parsedLocation) && !empty($parsedLocation['query'])) {
+                                $query = '?' . $parsedLocation['query'];
+                            } else {
+                                $query = '';
+                            }
+                            if (array_key_exists('fragment', $parsedLocation) && !empty($parsedLocation['fragment']) && '#' !== $parsedLocation['fragment'][0]) {
+                                $fragment = '#' . $parsedLocation['fragment'];
+                            } else {
+                                $fragment = '';
+                            }
+                            // Scheme is missing when both URLs are protocol-relative, path when the target is a bare host
+                            $scheme = $parsedLocation['scheme'] ?? self::DEFAULT_SCHEME;
+                            $host = $parsedLocation['host'] ?? '';
+                            $path = $parsedLocation['path'] ?? '';
+                            $location = "{$scheme}://{$host}{$path}{$query}$fragment";
+                            break;
+                        }
+                    }
+                }
+                break;
+            case 429: // Too Many Requests
+            case 503: // Service Unavailable
+                foreach ($headers as $header) {
+                    if (str_starts_with(strtolower($header), 'retry-after: ')) {
+                        $value = trim(explode(': ', $header, 2)[1]);
+                        if (is_numeric($value)) {
+                            $delay = intval($value);
+                            if ($delay && $delay <= $retryDelay * ($retryCount-1)) {
+                                sleep($delay);
+                            }
+                        }
+                    }
+                }
+                // no break: retried like a time out
+            case 522: // Connection Timed Out
+                if ($tryNumber <= $retryCount) {
+                    sleep($retryDelay);
+                    // `$tryNumber++` would pass the current value again and retry endlessly
+                    return self::testUrl($url, $external, $testFragment, $useCurl, $retryCount, $retryDelay, $tryNumber + 1);
+                }
+                // no break: out of retries, reported like any other code
+            case 400: // Bad Request
+            case 401: // Unauthorized
+            case 403: // Forbidden
+            case 404: // Not Found
+            case 405: // Method Not Allowed
+            case 455: // Method Not Valid in This State
+            case 500: // Internal Server Error
+            case 521: // Web Server Is Down
+            default:
+        }
+
+        return [
+            'headers' => $headers,
+            'code' => $code,
+            'location' => $location,
+            'fragment_found' => $fragmentFound,
+        ];
+    }
+
+    /**
+     * Request the response headers of a URL.
+     *
+     * @param string $url URL to request
+     * @param bool $useCurl Run `curl -s -I` instead of get_headers()
+     * @return string[] Header lines, status line first; empty array when get_headers() fails
+     */
+    public static function requestHeaders(string $url, bool $useCurl = false): array
+    {
+        if ($useCurl) {
+            // The URL comes from scanned documents: never hand it unescaped to the shell
+            $output = shell_exec('curl -s -I ' . escapeshellarg($url));
+            return preg_split('/\R/', trim(is_string($output) ? $output : '')) ?: [];
+        }
+        return @get_headers($url) ?: [];
+    }
+
+    /**
+     * Request the content available at a URL.
+     *
+     * @param string $url URL or local path to read
+     * @param bool $useCurl Run `curl -s` instead of file_get_contents()
+     * @return string The content, an empty string when it can't be fetched
+     */
+    public static function requestBody(string $url, bool $useCurl = false): string
+    {
+        if ($useCurl) {
+            // Same as for headers: the URL must be escaped before reaching the shell
+            return shell_exec('curl -s ' . escapeshellarg($url)) ?? 
'';
+        }
+        return @file_get_contents($url) ?: '';
+    }
+
+    /** @return string[] */
+    public function getHeaders(): array
+    {
+        return $this->headers ?: [];
+    }
+
+    /** Status code as a string padded with zeros on the left up to three characters, or a blank string when there is no code. */
+    public function getCode(): string
+    {
+        if (empty($this->code)) {
+            return ' ';
+        }
+
+        return str_pad((string)$this->code, 3, '0', STR_PAD_LEFT);
+    }
+
+    public function hasLocation(): bool
+    {
+        return null !== $this->location;
+    }
+
+    public function getLocation(): ?TestableUrl
+    {
+        return $this->location;
+    }
+
+    /**
+     * Follow the chain of redirect targets, this URL excluded.
+     *
+     * @return TestableUrl[]
+     */
+    public function getLocations(): array
+    {
+        $locations = [];
+        $location = $this;
+        while ($location->hasLocation()) {
+            $location = $location->getLocation();
+            $locations[] = $location;
+        }
+        return $locations;
+    }
+
+    /**
+     * Raw URL followed by the raw URL of each redirect target.
+     *
+     * @return string[]
+     */
+    public function getUrls(): array
+    {
+        $urls = [$this->getUrl()];
+        foreach ($this->getLocations() as $location) {
+            $urls[] = $location->getUrl();
+        }
+        return $urls;
+    }
+
+    /** Tell if the MIME type starts with one of `PLAIN_TEXT_MIME_TYPES`. */
+    public static function isPlainTextMimeType(string $mimeType): bool
+    {
+        foreach (self::PLAIN_TEXT_MIME_TYPES as $plainTextMimeType) {
+            if (str_starts_with($mimeType, $plainTextMimeType)) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /** Tell if the extension, with or without leading dot, is one of `PLAIN_TEXT_EXTENSIONS`. */
+    public static function isPlainTextExtension(string $extension): bool
+    {
+        if (str_starts_with($extension, '.')) {
+            $extension = substr($extension, 1);
+        }
+
+        foreach (self::PLAIN_TEXT_EXTENSIONS as $plainTextExtension) {
+            if ($extension === $plainTextExtension) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /** Tell if the URL contains a `#` which isn't its last character. */
+    public static function isUrlWithFragment($url): bool
+    {
+        return str_contains($url, '#') && !str_ends_with($url, '#');
+    }
+
+    public function hasFragment(): bool
+    {
+        return self::isUrlWithFragment($this->getUrl());
+    }
+
+    /**
+     * Get the fragment part of a URL, without the `#`.
+     *
+     * @return string The fragment, an empty string when there is none or when the URL can't be parsed
+     */
+    public static function getUrlFragment($url): string
+    {
+        // parse_url() returns null without fragment and false on malformed URL; the return type only allows string
+        $fragment = parse_url($url, PHP_URL_FRAGMENT);
+
+        return is_string($fragment) ? $fragment : '';
+    }
+
+    /** Get the URL up to the first `#`, or the URL itself when it has no fragment. */
+    public static function getUrlWithoutFragment($url): string
+    {
+        if (self::isUrlWithFragment($url)) {
+            // Cut at the first `#` rather than searching the fragment text back in the URL
+            return explode('#', $url, 2)[0];
+        }
+        return $url;
+    }
+
+    public function getFragment(): string
+    {
+        return self::getUrlFragment($this->getUrl());
+    }
+
+    /** Tell if the raw URL is only a fragment (starts with `#`). */
+    private function isFragment(): bool
+    {
+        return str_starts_with($this->getUrl(), '#');
+    }
+
+    public function isFragmentFound(): ?bool
+    {
+        return $this->fragmentFound;
+    }
+}
+
+
+class UrlExtractor
+{
+    /** @var array[] */
+    private $patterns;
+
+    /** @var array|null */
+    private $replacements;
+
+    /** @var bool|string */
+    private $find;
+
+    private const PATTERN_DELIMITER = '@';
+    // Optional leading line number, followed by an optional colon; read back as $matches['line'] in extract()
+    private const LINE_PATTERN = '(?P<line>[0-9]+)?:?';
+
+    /**
+     * @param array[]|null $patterns A map of file extensions and pattern lists.
+     * @param string[]|null $replacements A map of replacements ['what_to_replace'=>'by_what_to replace']
+     * @param bool|string $find If the URL target must be searched for instead of just being considered as a relative path; If a string is given, it will be used as search prefix
+     * @see UrlExtractor::getDefaultPatterns For an example of pattern map.
+     */
+    public function __construct(?array $patterns = null, ?array $replacements = null, mixed $find = false)
+    {
+        $this->patterns = null === $patterns ? self::getDefaultPatterns() : $patterns;
+        $this->flattenPatterns();
+        $this->replacements = $replacements;
+        $this->find = $find;//TODO remove trailing slash
+    }
+
+    /**
+     * Extract the URLs of a file.
+     *
+     * A `zip` file is handed to extractFromArchive(). For any other file, the output of the grep command
+     * is matched line by line against the patterns of the file extension. For an `html` file, the result
+     * of extractBase() is passed along to each URL.
+     *
+     * @return TestableUrl[]
+     */
+    public function extract(string $file): array
+    {
+        $base = null;
+
+        $extension = pathinfo($file, PATHINFO_EXTENSION);
+        if ('zip' === $extension) {
+            return $this->extractFromArchive($file);
+        } elseif ('html' === $extension) {
+            $base = $this->extractBase($file);
+        }
+
+        //var_dump($this->getGrepCommand($file));
+        $grepOutput = trim(shell_exec($this->getGrepCommand($file)) ?? '');
+        if (empty($grepOutput)) {
+            return [];
+        }
+
+        $grepLines = explode("\n", $grepOutput);
+        unset($grepOutput);
+
+        $urls = [];
+
+        $line = 0;
+        $linePattern = self::LINE_PATTERN;
+        $patterns = $this->getPhpPatterns($extension);
+        $patternDelimiter = self::PATTERN_DELIMITER;
+        foreach ($grepLines as $grepLine) {
+            $matches = [];
+            // First matching pattern wins
+            foreach ($patterns as $pattern) {
+                if (preg_match("{$patternDelimiter}{$linePattern}{$pattern}{$patternDelimiter}", $grepLine, $matches)) {
+                    break;
+                }
+            }
+            // A grep line without number keeps the number of the previous one
+            if (array_key_exists('line', $matches) && strlen($matches['line'])) {
+                $line = $matches['line'];
+            }
+            if (empty($matches['url'])) {
+                continue;
+            }
+            $urls[] = new TestableUrl($matches['url'], array_key_exists('text', $matches) ? $matches['text'] : null, $file, $line, $this->replacements, $this->find, $base);
+        }
+
+        return $urls;
+    }
+
+    /** @return TestableUrl[] */
+    private function extractFromArchive(string $archivePath): array
+    {
+        $urls = [];
+
+        $dir = pathinfo($archivePath, PATHINFO_DIRNAME);
+        $rawFileList = explode("\n", shell_exec("unzip -l $archivePath"));
+        $rawFileList = array_slice($rawFileList, 3, -3);
+        foreach ($rawFileList as $rawFileLine) {
+            if (!preg_match('@ *(?P[^ ]+) +(?P[^ ]+) +(?P