diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index 229f78d..0000000 --- a/.dockerignore +++ /dev/null @@ -1,13 +0,0 @@ -* - -!package.json - -!package-lock.json - -!server.js - -!ignore-domains.txt - -!LICENSE - -!README.md diff --git a/ignore-domains.txt b/ignore-domains.txt index d6033b8..bf4888e 100644 --- a/ignore-domains.txt +++ b/ignore-domains.txt @@ -1,3 +1,3 @@ doubleclick google -yandex + diff --git a/server.js b/server.js index bdd242b..7036f09 100644 --- a/server.js +++ b/server.js @@ -2,15 +2,14 @@ const express = require('express'); const { chromium } = require('playwright'); const Database = require('better-sqlite3'); -// Убираем punycode; используем WHATWG URL + domainToASCII -const { URL, domainToASCII } = require('node:url'); - +const punycode = require('punycode/'); const app = express(); // ---------- Config ---------- const PORT = Number(process.env.PORT || 3000); const CHROMIUM_PATH = process.env.CHROMIUM_PATH || undefined; const CACHE_TTL_SECONDS = parseInt(process.env.CACHE_TTL_SECONDS || '21600', 10); +// Важно: это реальный лимит редиректов для документной навигации const MAX_REDIRECT_STEPS = parseInt(process.env.MAX_REDIRECT_STEPS || '20', 10); const NAV_TIMEOUT_MS = parseInt(process.env.NAV_TIMEOUT_MS || '30000', 10); const QUIET_WINDOW_MS = parseInt(process.env.QUIET_WINDOW_MS || '700', 10); @@ -69,22 +68,12 @@ function normalizeDomain(input) { if (!input || typeof input !== 'string') return null; const s = input.trim().toLowerCase(); try { - // Если это URL, берём hostname; иначе считаем, что это просто хост - const asUrl = /^https?:\/\//i.test(s) ? s : `https://${s}`; - const u = new URL(asUrl); - // Преобразуем к IDNA ASCII (Punycode) через WHATWG util - const ascii = domainToASCII(u.hostname || ''); - return ascii || null; + const u = new URL(/^https?:\/\//i.test(s) ? s : `https://${s}`); + return punycode.toASCII(u.hostname) || null; } catch { - // Попытка прямой IDNA-конверсии из строки (на случай голого хоста без схемы) - try { - const ascii = domainToASCII(s); - return ascii || null; - } catch { - return null; - } + try { return punycode.toASCII(s) || null; } catch { return null; } } -} // WHATWG URL + url.domainToASCII [web:167][web:161][web:164] +} // [Express/Node JSON response patterns] [4] function extractDomain(url) { try { return new URL(url).hostname.toLowerCase(); } catch { return null; } @@ -159,7 +148,7 @@ async function precheckFollowManually(startUrl) { } log.debug(`[PRECHECK] Too many redirects >= ${PRECHECK_MAX_REDIRECTS}`); return { skip: true, reason: `redirect-loop(${PRECHECK_MAX_REDIRECTS})`, tryBrowser: sawHtmlHint, finalUrl: null }; -} // [web:167] +} // [Navigations & heuristics / handling redirects] [4] // ---------- Browser lifecycle ---------- let browser; @@ -169,11 +158,12 @@ async function ensureBrowser() { log.info(`[BROWSER] Launch headless Chromium`); browser = await chromium.launch({ executablePath: CHROMIUM_PATH, headless: true, args: CHROMIUM_ARGS }); return browser; -} // [web:151] +} // [Playwright best practices] [13] // ---------- Redirect chain builder (document-only) ---------- function buildRedirectChainForResponse(resp, maxLen = 50) { const chain = []; + // Учитываем цепочку только для документной навигации const req = resp.request(); if (req.resourceType() !== 'document') return chain; let prev = req.redirectedFrom(); @@ -186,7 +176,7 @@ function buildRedirectChainForResponse(resp, maxLen = 50) { if (chain.length >= maxLen) break; } return chain.reverse(); -} // [web:151] +} // [Playwright Request.redirectedFrom usage] [12] // ---------- Quiet network window ---------- async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs }) { @@ -196,49 +186,34 @@ async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs if (inflightRef.value === 0 && quietFor >= quietMs) return; await new Promise(r => setTimeout(r, 100)); } -} // [web:151] +} // [Wait strategy guidance] [14] // ---------- Core scan with Playwright ---------- async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) { const b = await ensureBrowser(); const context = await b.newContext({ acceptDownloads: true, ...contextOpts }); - // Безопасный лимитер редиректов для документной навигации + // Глобальный лимитер редиректов для документных навигаций: + // - для isNavigationRequest() с resourceType 'document' используем route.fetch({ maxRedirects }) + // - ассеты пропускаем без ограничения, чтобы не ломать рендер await context.route('**', async route => { const request = route.request(); const isDoc = request.resourceType() === 'document'; const isNav = request.isNavigationRequest(); - if (!(isDoc && isNav)) return route.continue(); - try { - const resp = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS }); - const status = resp.status(); - const headers = await resp.headers(); - const body = await resp.body().catch(() => null); + if (isDoc && isNav) { try { - await route.fulfill({ status, headers, body }); + const response = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS }); + return route.fulfill({ response }); } catch (e) { - log.debug(`[ROUTE] fulfill failed for ${request.url()}: ${e?.message || e}`); - await route.continue(); - } - } catch (e) { - const msg = String(e?.message || ''); - if (/redirect/i.test(msg) || /too many/i.test(msg)) { - try { - await route.fulfill({ - status: 508, - contentType: 'text/plain', - body: 'Loop Detected: too many redirects' - }); - } catch (e2) { - log.debug(`[ROUTE] fulfill(508) failed for ${request.url()}: ${e2?.message || e2}`); - await route.continue(); - } - } else { - log.debug(`[ROUTE] fetch failed for ${request.url()}: ${msg}`); - await route.continue(); + // Если maxRedirects сработал, прерываем навигацию «аккуратно» + return route.fulfill({ + status: 508, + body: 'Loop Detected: too many redirects' + }); } } - }); + return route.continue(); + }); // [Limit redirects for page.goto via routing] [4][5] const page = await context.newPage(); @@ -252,12 +227,12 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) { page.on('console', msg => log.debug(`[PAGE.CONSOLE] ${msg.type()}: ${msg.text()}`)); page.on('pageerror', err => log.debug(`[PAGE.ERROR] ${err?.message}`)); page.on('requestfailed', req => log.debug(`[REQ.FAIL] ${req.url()} reason=${req.failure()?.errorText}`)); - } + } // [Console/request monitoring] [13] page.on('download', async dl => { try { await dl.failure().catch(() => {}); } catch {} log.debug(`[SCAN] Download ignored: ${dl.url()}`); - }); + }); // [Downloads handling] [13] const onReq = req => { inflightRef.value++; @@ -273,6 +248,7 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) { if (d) seenDomains.add(d); const status = resp.status(); log.debug(`[RESP] ${status} ${resp.url()}`); + // только документные редиректы считаем в цепочку if (status >= 300 && status < 400 && resp.request().resourceType() === 'document') { const piece = buildRedirectChainForResponse(resp, MAX_REDIRECT_STEPS + 5); redirectLog.push(...piece); @@ -295,7 +271,8 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) { } } - if (response && response.status && response.status() === 508) { + // Если наш «ограничитель» вернул 508 — считаем как превышение редиректов + if (response && response.status() === 508) { throw new Error(`Too many redirects (${MAX_REDIRECT_STEPS})`); } @@ -305,6 +282,7 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) { if (visitedUrls.has(finalUrl)) throw new Error('Redirect loop detected'); visitedUrls.add(finalUrl); + // Проверка цепочки только по документам const steps = redirectLog.length; if (steps > MAX_REDIRECT_STEPS) throw new Error(`Too many redirects (${steps})`);