Remove punnycode from server code, unlink old pcode library
This commit is contained in:
parent
35b0ff1cd5
commit
a038862553
2 changed files with 65 additions and 30 deletions
13
.dockerignore
Normal file
13
.dockerignore
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
*
|
||||
|
||||
!package.json
|
||||
|
||||
!package-lock.json
|
||||
|
||||
!server.js
|
||||
|
||||
!ignore-domains.txt
|
||||
|
||||
!LICENSE
|
||||
|
||||
!README.md
|
||||
82
server.js
82
server.js
|
|
@ -2,14 +2,15 @@
|
|||
const express = require('express');
|
||||
const { chromium } = require('playwright');
|
||||
const Database = require('better-sqlite3');
|
||||
const punycode = require('punycode/');
|
||||
// Убираем punycode; используем WHATWG URL + domainToASCII
|
||||
const { URL, domainToASCII } = require('node:url');
|
||||
|
||||
const app = express();
|
||||
|
||||
// ---------- Config ----------
|
||||
const PORT = Number(process.env.PORT || 3000);
|
||||
const CHROMIUM_PATH = process.env.CHROMIUM_PATH || undefined;
|
||||
const CACHE_TTL_SECONDS = parseInt(process.env.CACHE_TTL_SECONDS || '21600', 10);
|
||||
// Важно: это реальный лимит редиректов для документной навигации
|
||||
const MAX_REDIRECT_STEPS = parseInt(process.env.MAX_REDIRECT_STEPS || '20', 10);
|
||||
const NAV_TIMEOUT_MS = parseInt(process.env.NAV_TIMEOUT_MS || '30000', 10);
|
||||
const QUIET_WINDOW_MS = parseInt(process.env.QUIET_WINDOW_MS || '700', 10);
|
||||
|
|
@ -68,12 +69,22 @@ function normalizeDomain(input) {
|
|||
if (!input || typeof input !== 'string') return null;
|
||||
const s = input.trim().toLowerCase();
|
||||
try {
|
||||
const u = new URL(/^https?:\/\//i.test(s) ? s : `https://${s}`);
|
||||
return punycode.toASCII(u.hostname) || null;
|
||||
// Если это URL, берём hostname; иначе считаем, что это просто хост
|
||||
const asUrl = /^https?:\/\//i.test(s) ? s : `https://${s}`;
|
||||
const u = new URL(asUrl);
|
||||
// Преобразуем к IDNA ASCII (Punycode) через WHATWG util
|
||||
const ascii = domainToASCII(u.hostname || '');
|
||||
return ascii || null;
|
||||
} catch {
|
||||
try { return punycode.toASCII(s) || null; } catch { return null; }
|
||||
// Попытка прямой IDNA-конверсии из строки (на случай голого хоста без схемы)
|
||||
try {
|
||||
const ascii = domainToASCII(s);
|
||||
return ascii || null;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
} // [Express/Node JSON response patterns] [4]
|
||||
} // WHATWG URL + url.domainToASCII [web:167][web:161][web:164]
|
||||
|
||||
function extractDomain(url) {
|
||||
try { return new URL(url).hostname.toLowerCase(); } catch { return null; }
|
||||
|
|
@ -148,7 +159,7 @@ async function precheckFollowManually(startUrl) {
|
|||
}
|
||||
log.debug(`[PRECHECK] Too many redirects >= ${PRECHECK_MAX_REDIRECTS}`);
|
||||
return { skip: true, reason: `redirect-loop(${PRECHECK_MAX_REDIRECTS})`, tryBrowser: sawHtmlHint, finalUrl: null };
|
||||
} // [Navigations & heuristics / handling redirects] [4]
|
||||
} // [web:167]
|
||||
|
||||
// ---------- Browser lifecycle ----------
|
||||
let browser;
|
||||
|
|
@ -158,12 +169,11 @@ async function ensureBrowser() {
|
|||
log.info(`[BROWSER] Launch headless Chromium`);
|
||||
browser = await chromium.launch({ executablePath: CHROMIUM_PATH, headless: true, args: CHROMIUM_ARGS });
|
||||
return browser;
|
||||
} // [Playwright best practices] [13]
|
||||
} // [web:151]
|
||||
|
||||
// ---------- Redirect chain builder (document-only) ----------
|
||||
function buildRedirectChainForResponse(resp, maxLen = 50) {
|
||||
const chain = [];
|
||||
// Учитываем цепочку только для документной навигации
|
||||
const req = resp.request();
|
||||
if (req.resourceType() !== 'document') return chain;
|
||||
let prev = req.redirectedFrom();
|
||||
|
|
@ -176,7 +186,7 @@ function buildRedirectChainForResponse(resp, maxLen = 50) {
|
|||
if (chain.length >= maxLen) break;
|
||||
}
|
||||
return chain.reverse();
|
||||
} // [Playwright Request.redirectedFrom usage] [12]
|
||||
} // [web:151]
|
||||
|
||||
// ---------- Quiet network window ----------
|
||||
async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs }) {
|
||||
|
|
@ -186,34 +196,49 @@ async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs
|
|||
if (inflightRef.value === 0 && quietFor >= quietMs) return;
|
||||
await new Promise(r => setTimeout(r, 100));
|
||||
}
|
||||
} // [Wait strategy guidance] [14]
|
||||
} // [web:151]
|
||||
|
||||
// ---------- Core scan with Playwright ----------
|
||||
async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
|
||||
const b = await ensureBrowser();
|
||||
const context = await b.newContext({ acceptDownloads: true, ...contextOpts });
|
||||
|
||||
// Глобальный лимитер редиректов для документных навигаций:
|
||||
// - для isNavigationRequest() с resourceType 'document' используем route.fetch({ maxRedirects })
|
||||
// - ассеты пропускаем без ограничения, чтобы не ломать рендер
|
||||
// Безопасный лимитер редиректов для документной навигации
|
||||
await context.route('**', async route => {
|
||||
const request = route.request();
|
||||
const isDoc = request.resourceType() === 'document';
|
||||
const isNav = request.isNavigationRequest();
|
||||
if (isDoc && isNav) {
|
||||
if (!(isDoc && isNav)) return route.continue();
|
||||
try {
|
||||
const resp = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS });
|
||||
const status = resp.status();
|
||||
const headers = await resp.headers();
|
||||
const body = await resp.body().catch(() => null);
|
||||
try {
|
||||
const response = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS });
|
||||
return route.fulfill({ response });
|
||||
await route.fulfill({ status, headers, body });
|
||||
} catch (e) {
|
||||
// Если maxRedirects сработал, прерываем навигацию «аккуратно»
|
||||
return route.fulfill({
|
||||
status: 508,
|
||||
body: 'Loop Detected: too many redirects'
|
||||
});
|
||||
log.debug(`[ROUTE] fulfill failed for ${request.url()}: ${e?.message || e}`);
|
||||
await route.continue();
|
||||
}
|
||||
} catch (e) {
|
||||
const msg = String(e?.message || '');
|
||||
if (/redirect/i.test(msg) || /too many/i.test(msg)) {
|
||||
try {
|
||||
await route.fulfill({
|
||||
status: 508,
|
||||
contentType: 'text/plain',
|
||||
body: 'Loop Detected: too many redirects'
|
||||
});
|
||||
} catch (e2) {
|
||||
log.debug(`[ROUTE] fulfill(508) failed for ${request.url()}: ${e2?.message || e2}`);
|
||||
await route.continue();
|
||||
}
|
||||
} else {
|
||||
log.debug(`[ROUTE] fetch failed for ${request.url()}: ${msg}`);
|
||||
await route.continue();
|
||||
}
|
||||
}
|
||||
return route.continue();
|
||||
}); // [Limit redirects for page.goto via routing] [4][5]
|
||||
});
|
||||
|
||||
const page = await context.newPage();
|
||||
|
||||
|
|
@ -227,12 +252,12 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
|
|||
page.on('console', msg => log.debug(`[PAGE.CONSOLE] ${msg.type()}: ${msg.text()}`));
|
||||
page.on('pageerror', err => log.debug(`[PAGE.ERROR] ${err?.message}`));
|
||||
page.on('requestfailed', req => log.debug(`[REQ.FAIL] ${req.url()} reason=${req.failure()?.errorText}`));
|
||||
} // [Console/request monitoring] [13]
|
||||
}
|
||||
|
||||
page.on('download', async dl => {
|
||||
try { await dl.failure().catch(() => {}); } catch {}
|
||||
log.debug(`[SCAN] Download ignored: ${dl.url()}`);
|
||||
}); // [Downloads handling] [13]
|
||||
});
|
||||
|
||||
const onReq = req => {
|
||||
inflightRef.value++;
|
||||
|
|
@ -248,7 +273,6 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
|
|||
if (d) seenDomains.add(d);
|
||||
const status = resp.status();
|
||||
log.debug(`[RESP] ${status} ${resp.url()}`);
|
||||
// только документные редиректы считаем в цепочку
|
||||
if (status >= 300 && status < 400 && resp.request().resourceType() === 'document') {
|
||||
const piece = buildRedirectChainForResponse(resp, MAX_REDIRECT_STEPS + 5);
|
||||
redirectLog.push(...piece);
|
||||
|
|
@ -271,8 +295,7 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
|
|||
}
|
||||
}
|
||||
|
||||
// Если наш «ограничитель» вернул 508 — считаем как превышение редиректов
|
||||
if (response && response.status() === 508) {
|
||||
if (response && response.status && response.status() === 508) {
|
||||
throw new Error(`Too many redirects (${MAX_REDIRECT_STEPS})`);
|
||||
}
|
||||
|
||||
|
|
@ -282,7 +305,6 @@ async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
|
|||
if (visitedUrls.has(finalUrl)) throw new Error('Redirect loop detected');
|
||||
visitedUrls.add(finalUrl);
|
||||
|
||||
// Проверка цепочки только по документам
|
||||
const steps = redirectLog.length;
|
||||
if (steps > MAX_REDIRECT_STEPS) throw new Error(`Too many redirects (${steps})`);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue