Compare commits

..

No commits in common. "d6d6027a17d4a885491bdef9d95af537e0ae0fa2" and "35b0ff1cd5a9fcc04900a5b2c86ea15c92fb918b" have entirely different histories.

3 changed files with 31 additions and 66 deletions

View file

@ -1,13 +0,0 @@
*
!package.json
!package-lock.json
!server.js
!ignore-domains.txt
!LICENSE
!README.md

View file

@ -1,3 +1,3 @@
doubleclick
google
yandex

View file

@ -2,15 +2,14 @@
const express = require('express');
const { chromium } = require('playwright');
const Database = require('better-sqlite3');
// Убираем punycode; используем WHATWG URL + domainToASCII
const { URL, domainToASCII } = require('node:url');
const punycode = require('punycode/');
const app = express();
// ---------- Config ----------
const PORT = Number(process.env.PORT || 3000);
const CHROMIUM_PATH = process.env.CHROMIUM_PATH || undefined;
const CACHE_TTL_SECONDS = parseInt(process.env.CACHE_TTL_SECONDS || '21600', 10);
// Важно: это реальный лимит редиректов для документной навигации
const MAX_REDIRECT_STEPS = parseInt(process.env.MAX_REDIRECT_STEPS || '20', 10);
const NAV_TIMEOUT_MS = parseInt(process.env.NAV_TIMEOUT_MS || '30000', 10);
const QUIET_WINDOW_MS = parseInt(process.env.QUIET_WINDOW_MS || '700', 10);
@ -69,22 +68,12 @@ function normalizeDomain(input) {
if (!input || typeof input !== 'string') return null;
const s = input.trim().toLowerCase();
try {
// Если это URL, берём hostname; иначе считаем, что это просто хост
const asUrl = /^https?:\/\//i.test(s) ? s : `https://${s}`;
const u = new URL(asUrl);
// Преобразуем к IDNA ASCII (Punycode) через WHATWG util
const ascii = domainToASCII(u.hostname || '');
return ascii || null;
const u = new URL(/^https?:\/\//i.test(s) ? s : `https://${s}`);
return punycode.toASCII(u.hostname) || null;
} catch {
// Попытка прямой IDNA-конверсии из строки (на случай голого хоста без схемы)
try {
const ascii = domainToASCII(s);
return ascii || null;
} catch {
return null;
}
try { return punycode.toASCII(s) || null; } catch { return null; }
}
} // WHATWG URL + url.domainToASCII [web:167][web:161][web:164]
} // [Express/Node JSON response patterns] [4]
function extractDomain(url) {
try { return new URL(url).hostname.toLowerCase(); } catch { return null; }
@ -159,7 +148,7 @@ async function precheckFollowManually(startUrl) {
}
log.debug(`[PRECHECK] Too many redirects >= ${PRECHECK_MAX_REDIRECTS}`);
return { skip: true, reason: `redirect-loop(${PRECHECK_MAX_REDIRECTS})`, tryBrowser: sawHtmlHint, finalUrl: null };
} // [web:167]
} // [Navigations & heuristics / handling redirects] [4]
// ---------- Browser lifecycle ----------
let browser;
@ -169,11 +158,12 @@ async function ensureBrowser() {
log.info(`[BROWSER] Launch headless Chromium`);
browser = await chromium.launch({ executablePath: CHROMIUM_PATH, headless: true, args: CHROMIUM_ARGS });
return browser;
} // [web:151]
} // [Playwright best practices] [13]
// ---------- Redirect chain builder (document-only) ----------
function buildRedirectChainForResponse(resp, maxLen = 50) {
const chain = [];
// Учитываем цепочку только для документной навигации
const req = resp.request();
if (req.resourceType() !== 'document') return chain;
let prev = req.redirectedFrom();
@ -186,7 +176,7 @@ function buildRedirectChainForResponse(resp, maxLen = 50) {
if (chain.length >= maxLen) break;
}
return chain.reverse();
} // [web:151]
} // [Playwright Request.redirectedFrom usage] [12]
// ---------- Quiet network window ----------
async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs }) {
@ -196,49 +186,34 @@ async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs
if (inflightRef.value === 0 && quietFor >= quietMs) return;
await new Promise(r => setTimeout(r, 100));
}
} // [web:151]
} // [Wait strategy guidance] [14]
// ---------- Core scan with Playwright ----------
async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
const b = await ensureBrowser();
const context = await b.newContext({ acceptDownloads: true, ...contextOpts });
// Безопасный лимитер редиректов для документной навигации
// Глобальный лимитер редиректов для документных навигаций:
// - для isNavigationRequest() с resourceType 'document' используем route.fetch({ maxRedirects })
// - ассеты пропускаем без ограничения, чтобы не ломать рендер
await context.route('**', async route => {
const request = route.request();
const isDoc = request.resourceType() === 'document';
const isNav = request.isNavigationRequest();
if (!(isDoc && isNav)) return route.continue();
try {
const resp = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS });
const status = resp.status();
const headers = await resp.headers();
const body = await resp.body().catch(() => null);
if (isDoc && isNav) {
try {
await route.fulfill({ status, headers, body });
const response = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS });
return route.fulfill({ response });
} catch (e) {
log.debug(`[ROUTE] fulfill failed for ${request.url()}: ${e?.message || e}`);
await route.continue();
}
} catch (e) {
const msg = String(e?.message || '');
if (/redirect/i.test(msg) || /too many/i.test(msg)) {
try {
await route.fulfill({
status: 508,
contentType: 'text/plain',
body: 'Loop Detected: too many redirects'
});
} catch (e2) {
log.debug(`[ROUTE] fulfill(508) failed for ${request.url()}: ${e2?.message || e2}`);
await route.continue();
}
} else {
log.debug(`[ROUTE] fetch failed for ${request.url()}: ${msg}`);
await route.continue();
// Если maxRedirects сработал, прерываем навигацию «аккуратно»
return route.fulfill({
status: 508,
body: 'Loop Detected: too many redirects'
});
}
}
});
return route.continue();
}); // [Limit redirects for page.goto via routing] [4][5]
const page = await context.newPage();
@ -252,12 +227,12 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
page.on('console', msg => log.debug(`[PAGE.CONSOLE] ${msg.type()}: ${msg.text()}`));
page.on('pageerror', err => log.debug(`[PAGE.ERROR] ${err?.message}`));
page.on('requestfailed', req => log.debug(`[REQ.FAIL] ${req.url()} reason=${req.failure()?.errorText}`));
}
} // [Console/request monitoring] [13]
page.on('download', async dl => {
try { await dl.failure().catch(() => {}); } catch {}
log.debug(`[SCAN] Download ignored: ${dl.url()}`);
});
}); // [Downloads handling] [13]
const onReq = req => {
inflightRef.value++;
@ -273,6 +248,7 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
if (d) seenDomains.add(d);
const status = resp.status();
log.debug(`[RESP] ${status} ${resp.url()}`);
// только документные редиректы считаем в цепочку
if (status >= 300 && status < 400 && resp.request().resourceType() === 'document') {
const piece = buildRedirectChainForResponse(resp, MAX_REDIRECT_STEPS + 5);
redirectLog.push(...piece);
@ -295,7 +271,8 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
}
}
if (response && response.status && response.status() === 508) {
// Если наш «ограничитель» вернул 508 — считаем как превышение редиректов
if (response && response.status() === 508) {
throw new Error(`Too many redirects (${MAX_REDIRECT_STEPS})`);
}
@ -305,6 +282,7 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
if (visitedUrls.has(finalUrl)) throw new Error('Redirect loop detected');
visitedUrls.add(finalUrl);
// Проверка цепочки только по документам
const steps = redirectLog.length;
if (steps > MAX_REDIRECT_STEPS) throw new Error(`Too many redirects (${steps})`);