diff --git a/server.js b/server.js index 65bdc9c..7036f09 100644 --- a/server.js +++ b/server.js @@ -2,26 +2,46 @@ const express = require('express'); const { chromium } = require('playwright'); const Database = require('better-sqlite3'); +const punycode = require('punycode/'); const app = express(); -const port = process.env.PORT || 3000; -const executablePath = process.env.CHROMIUM_PATH || undefined; -const chromiumArgs = [ + +// ---------- Config ---------- +const PORT = Number(process.env.PORT || 3000); +const CHROMIUM_PATH = process.env.CHROMIUM_PATH || undefined; +const CACHE_TTL_SECONDS = parseInt(process.env.CACHE_TTL_SECONDS || '21600', 10); +// Важно: это реальный лимит редиректов для документной навигации +const MAX_REDIRECT_STEPS = parseInt(process.env.MAX_REDIRECT_STEPS || '20', 10); +const NAV_TIMEOUT_MS = parseInt(process.env.NAV_TIMEOUT_MS || '30000', 10); +const QUIET_WINDOW_MS = parseInt(process.env.QUIET_WINDOW_MS || '700', 10); +const PRECHECK_MAX_REDIRECTS = parseInt(process.env.PRECHECK_MAX_REDIRECTS || '15', 10); +const SQLITE_PATH = process.env.SQLITE_PATH || './cache.db'; +const DEBUG_ENABLED = String(process.env.DEBUG || '').trim() === '1'; +const CHROMIUM_ARGS = [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--no-zygote', ]; -const CACHE_TTL_SECONDS = parseInt(process.env.CACHE_TTL_SECONDS || '21600', 10); -const MAX_REDIRECT_STEPS = parseInt(process.env.MAX_REDIRECT_STEPS || '20', 10); // анти-цикл по глубине -const db = new Database(process.env.SQLITE_PATH || './cache.db'); + +// ---------- Logging ---------- +const log = { + info: (...a) => console.log(...a), + debug: (...a) => { if (DEBUG_ENABLED) console.log(...a); }, + warn: (...a) => console.warn(...a), + error: (...a) => console.error(...a), +}; + +// ---------- DB ---------- +log.info(`[BOOT] SQLite path: ${SQLITE_PATH}`); +const db = new Database(SQLITE_PATH); db.pragma('journal_mode = WAL'); db.exec(` CREATE TABLE IF NOT EXISTS domain_cache ( domain TEXT PRIMARY KEY, - result_json TEXT NOT NULL, -- JSON массива связанных доменов + result_json TEXT NOT NULL, final_url TEXT, - redirect_chain_json TEXT, -- JSON журнала редиректов + redirect_chain_json TEXT, updated_at INTEGER NOT NULL, ttl_at INTEGER NOT NULL ); @@ -40,132 +60,356 @@ ON CONFLICT(domain) DO UPDATE SET updated_at = excluded.updated_at, ttl_at = excluded.ttl_at `); + app.use(express.json()); + +// ---------- Helpers ---------- +function normalizeDomain(input) { + if (!input || typeof input !== 'string') return null; + const s = input.trim().toLowerCase(); + try { + const u = new URL(/^https?:\/\//i.test(s) ? s : `https://${s}`); + return punycode.toASCII(u.hostname) || null; + } catch { + try { return punycode.toASCII(s) || null; } catch { return null; } + } +} // [Express/Node JSON response patterns] [4] + function extractDomain(url) { - try { return new URL(url).hostname; } catch { return null; } + try { return new URL(url).hostname.toLowerCase(); } catch { return null; } } + +// эвристика «выглядит как файл» +function looksLikeFilePath(u) { + try { + const { pathname } = new URL(u); + return /\.(?:zip|pdf|png|jpe?g|gif|webp|svg|mp4|mp3|wav|csv|xlsx?|docx?|pptx?|exe|deb|rpm|apk|tar(?:\.gz)?|7z|gz|bz2)$/i.test(pathname); + } catch { return false; } +} + +// канонизация URL для детекции петель +function normalizeUrlForLoop(u) { + try { + const x = new URL(u); + x.hash = ''; + return x.toString(); + } catch { return u; } +} + +// ---------- Precheck: manual redirects & classification ---------- +async function precheckFollowManually(startUrl) { + let url = startUrl; + const visited = new Set(); + let sawHtmlHint = false; + for (let i = 0; i < PRECHECK_MAX_REDIRECTS; i++) { + const norm = normalizeUrlForLoop(url); + if (visited.has(norm)) { + log.debug(`[PRECHECK] Loop at ${norm}`); + return { skip: true, reason: 'redirect-loop', tryBrowser: sawHtmlHint }; + } + visited.add(norm); + let res; + try { + res = await fetch(url, { method: 'GET', redirect: 'manual' }); + } catch (e) { + log.debug(`[PRECHECK] GET(manual) failed for ${url}: ${e?.message}`); + return { skip: false, reason: null, tryBrowser: false }; + } + const status = res.status; + const ct = res.headers.get('content-type') || ''; + const cd = res.headers.get('content-disposition') || ''; + const loc = res.headers.get('location') || ''; + log.debug(`[PRECHECK] step=${i} status=${status} ct="${ct}" cd="${cd || '-'}" loc="${loc || '-'}"`); + const isHtml = /\btext\/html\b/i.test(ct); + if (isHtml) sawHtmlHint = true; + const isAttachment = /attachment/i.test(cd); + if (status === 403) { + return { skip: true, reason: 'forbidden', tryBrowser: true }; + } + if (status >= 300 && status < 400 && loc) { + const next = new URL(loc, url).toString(); + if (looksLikeFilePath(next) || /download|file|export/i.test(next)) { + return { skip: true, reason: `redirect-to-file(${next})`, tryBrowser: false, finalUrl: next }; + } + try { + const probe = await fetch(next, { method: 'GET', redirect: 'manual' }); + const pct = probe.headers.get('content-type') || ''; + const isHtmlTarget = /\btext\/html\b/i.test(pct); + if (isHtmlTarget) { + return { skip: true, reason: `marketing-redirect(${next})`, tryBrowser: false, finalUrl: next }; + } + } catch {} + url = next; + continue; + } + if (isAttachment) return { skip: true, reason: 'attachment', tryBrowser: false, finalUrl: url }; + if (!isHtml && ct) return { skip: true, reason: `non-HTML (${ct})`, tryBrowser: false, finalUrl: url }; + return { skip: false, reason: null, tryBrowser: false, finalUrl: url }; + } + log.debug(`[PRECHECK] Too many redirects >= ${PRECHECK_MAX_REDIRECTS}`); + return { skip: true, reason: `redirect-loop(${PRECHECK_MAX_REDIRECTS})`, tryBrowser: sawHtmlHint, finalUrl: null }; +} // [Navigations & heuristics / handling redirects] [4] + +// ---------- Browser lifecycle ---------- let browser; -async function getBrowser() { +async function ensureBrowser() { if (browser && browser.isConnected()) return browser; - browser = await chromium.launch({ - executablePath, - headless: true, - args: chromiumArgs, - }); + if (browser) { try { await browser.close(); } catch {} } + log.info(`[BROWSER] Launch headless Chromium`); + browser = await chromium.launch({ executablePath: CHROMIUM_PATH, headless: true, args: CHROMIUM_ARGS }); return browser; -} -// Вспомогательная функция для сборки полного журнала редиректов через цепочку redirectedFrom() -function buildRedirectChainForResponse(resp) { +} // [Playwright best practices] [13] + +// ---------- Redirect chain builder (document-only) ---------- +function buildRedirectChainForResponse(resp, maxLen = 50) { const chain = []; - const currentReq = resp.request(); - let prev = currentReq.redirectedFrom(); - let toUrl = currentReq.url(); + // Учитываем цепочку только для документной навигации + const req = resp.request(); + if (req.resourceType() !== 'document') return chain; + let prev = req.redirectedFrom(); + let toUrl = req.url(); const status = resp.status(); while (prev) { chain.push({ from: prev.url(), to: toUrl, status }); toUrl = prev.url(); prev = prev.redirectedFrom(); + if (chain.length >= maxLen) break; } return chain.reverse(); -} -async function scanDomainOnce(originDomain) { - const startUrl = `https://${originDomain}`; - const b = await getBrowser(); - const context = await b.newContext(); +} // [Playwright Request.redirectedFrom usage] [12] + +// ---------- Quiet network window ---------- +async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs }) { + const start = Date.now(); + while (Date.now() - start < timeoutMs) { + const quietFor = Date.now() - lastChangeRef.value; + if (inflightRef.value === 0 && quietFor >= quietMs) return; + await new Promise(r => setTimeout(r, 100)); + } +} // [Wait strategy guidance] [14] + +// ---------- Core scan with Playwright ---------- +async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) { + const b = await ensureBrowser(); + const context = await b.newContext({ acceptDownloads: true, ...contextOpts }); + + // Глобальный лимитер редиректов для документных навигаций: + // - для isNavigationRequest() с resourceType 'document' используем route.fetch({ maxRedirects }) + // - ассеты пропускаем без ограничения, чтобы не ломать рендер + await context.route('**', async route => { + const request = route.request(); + const isDoc = request.resourceType() === 'document'; + const isNav = request.isNavigationRequest(); + if (isDoc && isNav) { + try { + const response = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS }); + return route.fulfill({ response }); + } catch (e) { + // Если maxRedirects сработал, прерываем навигацию «аккуратно» + return route.fulfill({ + status: 508, + body: 'Loop Detected: too many redirects' + }); + } + } + return route.continue(); + }); // [Limit redirects for page.goto via routing] [4][5] + const page = await context.newPage(); + const seenDomains = new Set(); const redirectLog = []; - const visitedUrls = new Set(); // для детекции циклов - let redirectSteps = 0; - // Фиксируем все запросы - page.on('request', req => { + const visitedUrls = new Set(); + const inflightRef = { value: 0 }; + const lastChangeRef = { value: Date.now() }; + + if (DEBUG_ENABLED) { + page.on('console', msg => log.debug(`[PAGE.CONSOLE] ${msg.type()}: ${msg.text()}`)); + page.on('pageerror', err => log.debug(`[PAGE.ERROR] ${err?.message}`)); + page.on('requestfailed', req => log.debug(`[REQ.FAIL] ${req.url()} reason=${req.failure()?.errorText}`)); + } // [Console/request monitoring] [13] + + page.on('download', async dl => { + try { await dl.failure().catch(() => {}); } catch {} + log.debug(`[SCAN] Download ignored: ${dl.url()}`); + }); // [Downloads handling] [13] + + const onReq = req => { + inflightRef.value++; + lastChangeRef.value = Date.now(); const d = extractDomain(req.url()); if (d) seenDomains.add(d); - }); - // Фиксируем ответы и редиректные цепочки - page.on('response', resp => { - const url = resp.url(); - const d = extractDomain(url); + log.debug(`[REQ] ${req.method()} ${req.url()}`); + }; + const onResp = resp => { + inflightRef.value = Math.max(0, inflightRef.value - 1); + lastChangeRef.value = Date.now(); + const d = extractDomain(resp.url()); if (d) seenDomains.add(d); - // Добавим элементы цепочки, если ответ был редиректом (3xx) const status = resp.status(); - if (status >= 300 && status < 400) { - const piece = buildRedirectChainForResponse(resp); + log.debug(`[RESP] ${status} ${resp.url()}`); + // только документные редиректы считаем в цепочку + if (status >= 300 && status < 400 && resp.request().resourceType() === 'document') { + const piece = buildRedirectChainForResponse(resp, MAX_REDIRECT_STEPS + 5); redirectLog.push(...piece); } - }); + }; + page.on('request', onReq); + page.on('response', onResp); + try { - let currentUrl = startUrl; - // Анти-цикл: свой контроль над goto в несколько шагов — через ожидание события navigation и проверку URL - // Однако Playwright следует редиректам сам; для анти-цикла контролируем уникальность URL после перехода - const resp = await page.goto(currentUrl, { waitUntil: 'domcontentloaded', timeout: 30000 }); - // После авто-редиректов Playwright мы проверим фактическую цепочку через обработчики и page.url() - // Защита от «вечных» редиректов: проверим историю URL в performance entries - // Простой и надёжный способ: считать шаги смены URL в waitForNavigation с url predicate — но нам достаточно лимита по постфакту. - // Проверим финальный URL и убедимся, что не было явного зацикливания по уже виденным URL. + log.info(`[SCAN] goto(${startUrl}) domcontentloaded timeout=${NAV_TIMEOUT_MS}`); + let response; + try { + response = await page.goto(startUrl, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT_MS }); + } catch (e) { + const msg = String(e?.message || ''); + if (/Download is starting/i.test(msg)) { + log.info(`[SCAN] goto triggered download; continue as non-HTML`); + } else { + throw e; + } + } + + // Если наш «ограничитель» вернул 508 — считаем как превышение редиректов + if (response && response.status() === 508) { + throw new Error(`Too many redirects (${MAX_REDIRECT_STEPS})`); + } + + await quietWindowWait({ inflightRef, lastChangeRef, timeoutMs: NAV_TIMEOUT_MS, quietMs: QUIET_WINDOW_MS }); const finalUrl = page.url(); - if (visitedUrls.has(finalUrl)) { - throw new Error('Redirect loop detected'); - } + + if (visitedUrls.has(finalUrl)) throw new Error('Redirect loop detected'); visitedUrls.add(finalUrl); - // Как дополнительная защита — лимит по шагам 3xx из собранного redirectLog - // Если цепочка слишком длинная, считаем её небезопасной. - redirectSteps = redirectLog.length; - if (redirectSteps > MAX_REDIRECT_STEPS) { - throw new Error(`Too many redirects (${redirectSteps})`); - } + + // Проверка цепочки только по документам + const steps = redirectLog.length; + if (steps > MAX_REDIRECT_STEPS) throw new Error(`Too many redirects (${steps})`); + await context.close(); + const relatedDomains = Array.from(seenDomains) .filter(d => !d.includes('doubleclick') && !d.includes('google')) .sort(); - return { - finalUrl, - relatedDomains, - redirectChain: redirectLog, - }; + + log.info(`[SCAN] Done finalUrl=${finalUrl} domains=${relatedDomains.length} redirects=${steps}`); + return { finalUrl, relatedDomains, redirectChain: redirectLog }; } catch (e) { try { await context.close(); } catch {} + try { + if (browser && typeof browser.isConnected === 'function' && !browser.isConnected()) { + await browser.close(); browser = null; + } + } catch {} + log.error(`[SCAN] Error: ${e?.message}`); throw e; + } finally { + page.off('request', onReq); + page.off('response', onResp); } } + +// ---------- High-level scan with precheck and escalation ---------- +async function scanDomainOnce(originDomain) { + const startUrl = `https://${originDomain}`; + log.info(`[SCAN] Start domain="${originDomain}" url=${startUrl}`); + const pre = await precheckFollowManually(startUrl); + + if (pre.skip && (pre.reason === 'attachment' || (pre.reason || '').startsWith('non-HTML'))) { + log.info(`[SCAN] Skip non-HTML/attachment: ${pre.reason}`); + return { finalUrl: pre.finalUrl || startUrl, relatedDomains: [originDomain], redirectChain: [], precheck: pre.reason }; + } + + let targetUrl = startUrl; + + if (pre.skip && /^marketing-redirect/.test(pre.reason || '') && pre.finalUrl) { + log.info(`[SCAN] Marketing redirect -> follow target in browser: ${pre.finalUrl}`); + targetUrl = pre.finalUrl; + } else if (pre.skip && pre.tryBrowser) { + log.info(`[SCAN] Escalation to browser due to ${pre.reason}`); + } + + const contextOpts = { + userAgent: + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + locale: 'en-US', + timezoneId: 'UTC', + }; + + try { + const result = await scanWithBrowser(originDomain, targetUrl, contextOpts); + if (!result.relatedDomains.includes(originDomain)) { + result.relatedDomains.unshift(originDomain); + } + return result; + } catch (e) { + log.warn(`[SCAN] Browser escalation failed: ${e?.message}`); + return { finalUrl: targetUrl, relatedDomains: [originDomain], redirectChain: [], precheck: pre.reason || 'blocked' }; + } +} + +// ---------- Cache helpers ---------- function getFromCache(domain) { const row = stmtSelect.get(domain); if (!row) return null; const now = Math.floor(Date.now() / 1000); if (row.ttl_at > now) { - return { - relatedDomains: JSON.parse(row.result_json), - finalUrl: row.final_url || null, - redirectChain: row.redirect_chain_json ? JSON.parse(row.redirect_chain_json) : [], - cached: true, - cachedAt: row.updated_at, - ttlAt: row.ttl_at, - }; + try { + const out = { + relatedDomains: JSON.parse(row.result_json), + finalUrl: row.final_url || null, + redirectChain: row.redirect_chain_json ? JSON.parse(row.redirect_chain_json) : [], + cached: true, + cachedAt: row.updated_at, + ttlAt: row.ttl_at, + }; + return out; + } catch (e) { + log.warn(`[CACHE] Parse error: ${e?.message}`); + return null; + } } return null; } function putToCache(domain, result) { const now = Math.floor(Date.now() / 1000); const ttlAt = now + CACHE_TTL_SECONDS; - stmtUpsert.run({ - domain, - result_json: JSON.stringify(result.relatedDomains || []), - final_url: result.finalUrl || null, - redirect_chain_json: JSON.stringify(result.redirectChain || []), - updated_at: now, - ttl_at: ttlAt, - }); + try { + stmtUpsert.run({ + domain, + result_json: JSON.stringify(result.relatedDomains || []), + final_url: result.finalUrl || null, + redirect_chain_json: JSON.stringify(result.redirectChain || []), + updated_at: now, + ttl_at: ttlAt, + }); + log.info(`[CACHE] Upsert ${domain} ttlAt=${ttlAt}`); + } catch (e) { + log.warn(`[CACHE] Upsert error: ${e?.message}`); + } } + +// ---------- Routes ---------- app.get('/domains', async (req, res) => { - const { domain } = req.query; + res.type('application/json'); + const raw = req.query.domain; + log.info(`[HTTP] /domains?domain=${raw}`); + const domain = normalizeDomain(raw); if (!domain) { - res.status(400).json({ error: '"domain" query parameter is required' }); + res.status(400).json({ error: '"domain" must be a valid hostname', code: 'BAD_DOMAIN' }); return; } + + const HARD_TIMEOUT = parseInt(process.env.HARD_TIMEOUT_MS || '70000', 10); + const hardTimer = setTimeout(() => { + try { if (!res.headersSent) res.status(504).json({ error: 'Gateway Timeout', code: 'TIMEOUT' }); } catch {} + }, HARD_TIMEOUT); + try { const cached = getFromCache(domain); if (cached) { - res.json({ + log.info(`[HTTP] Cache HIT ${domain}`); + res.status(200).json({ domain, finalUrl: cached.finalUrl, relatedDomains: cached.relatedDomains, @@ -173,31 +417,80 @@ app.get('/domains', async (req, res) => { cached: true, cachedAt: cached.cachedAt, ttlAt: cached.ttlAt, + status: 'ok' }); return; } + const result = await scanDomainOnce(domain); + + if (result.precheck) { + if ((result.precheck || '').startsWith('marketing-redirect')) { + res.status(200).json({ + domain, + finalUrl: result.finalUrl || `https://${domain}`, + relatedDomains: [domain], + redirectChain: [], + cached: false, + status: 'ok', + note: result.precheck + }); + return; + } + res.status(200).json({ + domain, + finalUrl: result.finalUrl || `https://${domain}`, + relatedDomains: [domain], + redirectChain: [], + cached: false, + status: (result.precheck === 'forbidden' || result.precheck === 'blocked') ? 'blocked' : 'skipped', + reason: result.precheck + }); + return; + } + putToCache(domain, result); - res.json({ + res.status(200).json({ domain, finalUrl: result.finalUrl, relatedDomains: result.relatedDomains, redirectChain: result.redirectChain, cached: false, + status: 'ok' }); } catch (e) { - res.status(500).json({ error: e.message || 'Internal server error' }); + const msg = String(e?.message || 'Internal error'); + log.error(`[HTTP] Error for ${domain}: ${msg}`); + const forbidden = /403|forbidden|blocked/i.test(msg); + res.status(forbidden ? 403 : 500).json({ + error: forbidden ? 'Forbidden' : 'Internal server error', + code: forbidden ? 'FORBIDDEN' : 'INTERNAL', + details: msg + }); + } finally { + clearTimeout(hardTimer); } }); -app.get('/health', (_req, res) => res.json({ ok: true })); + +app.get('/health', (_req, res) => { + res.type('application/json'); + res.json({ ok: true }); +}); + +// ---------- Signals ---------- process.on('SIGTERM', async () => { + log.info('[SIGNAL] SIGTERM'); try { if (browser) await browser.close(); } catch {} process.exit(0); }); process.on('SIGINT', async () => { + log.info('[SIGNAL] SIGINT'); try { if (browser) await browser.close(); } catch {} process.exit(0); }); -app.listen(port, () => { - console.log(`Domain scanner service listening on port ${port}`); + +// ---------- Start ---------- +app.listen(PORT, () => { + log.info(`Domain scanner service listening on port ${PORT}`); }); +