// server.js const express = require('express'); const { chromium } = require('playwright'); const Database = require('better-sqlite3'); // Убираем punycode; используем WHATWG URL + domainToASCII const { URL, domainToASCII } = require('node:url'); const app = express(); // ---------- Config ---------- const PORT = Number(process.env.PORT || 3000); const CHROMIUM_PATH = process.env.CHROMIUM_PATH || undefined; const CACHE_TTL_SECONDS = parseInt(process.env.CACHE_TTL_SECONDS || '21600', 10); const MAX_REDIRECT_STEPS = parseInt(process.env.MAX_REDIRECT_STEPS || '20', 10); const NAV_TIMEOUT_MS = parseInt(process.env.NAV_TIMEOUT_MS || '30000', 10); const QUIET_WINDOW_MS = parseInt(process.env.QUIET_WINDOW_MS || '700', 10); const PRECHECK_MAX_REDIRECTS = parseInt(process.env.PRECHECK_MAX_REDIRECTS || '15', 10); const SQLITE_PATH = process.env.SQLITE_PATH || './cache.db'; const DEBUG_ENABLED = String(process.env.DEBUG || '').trim() === '1'; const CHROMIUM_ARGS = [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--no-zygote', ]; // ---------- Logging ---------- const log = { info: (...a) => console.log(...a), debug: (...a) => { if (DEBUG_ENABLED) console.log(...a); }, warn: (...a) => console.warn(...a), error: (...a) => console.error(...a), }; // ---------- DB ---------- log.info(`[BOOT] SQLite path: ${SQLITE_PATH}`); const db = new Database(SQLITE_PATH); db.pragma('journal_mode = WAL'); db.exec(` CREATE TABLE IF NOT EXISTS domain_cache ( domain TEXT PRIMARY KEY, result_json TEXT NOT NULL, final_url TEXT, redirect_chain_json TEXT, updated_at INTEGER NOT NULL, ttl_at INTEGER NOT NULL ); `); const stmtSelect = db.prepare(` SELECT result_json, final_url, redirect_chain_json, updated_at, ttl_at FROM domain_cache WHERE domain = ? `); const stmtUpsert = db.prepare(` INSERT INTO domain_cache (domain, result_json, final_url, redirect_chain_json, updated_at, ttl_at) VALUES (@domain, @result_json, @final_url, @redirect_chain_json, @updated_at, @ttl_at) ON CONFLICT(domain) DO UPDATE SET result_json = excluded.result_json, final_url = excluded.final_url, redirect_chain_json = excluded.redirect_chain_json, updated_at = excluded.updated_at, ttl_at = excluded.ttl_at `); app.use(express.json()); // ---------- Helpers ---------- function normalizeDomain(input) { if (!input || typeof input !== 'string') return null; const s = input.trim().toLowerCase(); try { // Если это URL, берём hostname; иначе считаем, что это просто хост const asUrl = /^https?:\/\//i.test(s) ? s : `https://${s}`; const u = new URL(asUrl); // Преобразуем к IDNA ASCII (Punycode) через WHATWG util const ascii = domainToASCII(u.hostname || ''); return ascii || null; } catch { // Попытка прямой IDNA-конверсии из строки (на случай голого хоста без схемы) try { const ascii = domainToASCII(s); return ascii || null; } catch { return null; } } } // WHATWG URL + url.domainToASCII [web:167][web:161][web:164] function extractDomain(url) { try { return new URL(url).hostname.toLowerCase(); } catch { return null; } } // эвристика «выглядит как файл» function looksLikeFilePath(u) { try { const { pathname } = new URL(u); return /\.(?:zip|pdf|png|jpe?g|gif|webp|svg|mp4|mp3|wav|csv|xlsx?|docx?|pptx?|exe|deb|rpm|apk|tar(?:\.gz)?|7z|gz|bz2)$/i.test(pathname); } catch { return false; } } // канонизация URL для детекции петель function normalizeUrlForLoop(u) { try { const x = new URL(u); x.hash = ''; return x.toString(); } catch { return u; } } // ---------- Precheck: manual redirects & classification ---------- async function precheckFollowManually(startUrl) { let url = startUrl; const visited = new Set(); let sawHtmlHint = false; for (let i = 0; i < PRECHECK_MAX_REDIRECTS; i++) { const norm = normalizeUrlForLoop(url); if (visited.has(norm)) { log.debug(`[PRECHECK] Loop at ${norm}`); return { skip: true, reason: 'redirect-loop', tryBrowser: sawHtmlHint }; } visited.add(norm); let res; try { res = await fetch(url, { method: 'GET', redirect: 'manual' }); } catch (e) { log.debug(`[PRECHECK] GET(manual) failed for ${url}: ${e?.message}`); return { skip: false, reason: null, tryBrowser: false }; } const status = res.status; const ct = res.headers.get('content-type') || ''; const cd = res.headers.get('content-disposition') || ''; const loc = res.headers.get('location') || ''; log.debug(`[PRECHECK] step=${i} status=${status} ct="${ct}" cd="${cd || '-'}" loc="${loc || '-'}"`); const isHtml = /\btext\/html\b/i.test(ct); if (isHtml) sawHtmlHint = true; const isAttachment = /attachment/i.test(cd); if (status === 403) { return { skip: true, reason: 'forbidden', tryBrowser: true }; } if (status >= 300 && status < 400 && loc) { const next = new URL(loc, url).toString(); if (looksLikeFilePath(next) || /download|file|export/i.test(next)) { return { skip: true, reason: `redirect-to-file(${next})`, tryBrowser: false, finalUrl: next }; } try { const probe = await fetch(next, { method: 'GET', redirect: 'manual' }); const pct = probe.headers.get('content-type') || ''; const isHtmlTarget = /\btext\/html\b/i.test(pct); if (isHtmlTarget) { return { skip: true, reason: `marketing-redirect(${next})`, tryBrowser: false, finalUrl: next }; } } catch {} url = next; continue; } if (isAttachment) return { skip: true, reason: 'attachment', tryBrowser: false, finalUrl: url }; if (!isHtml && ct) return { skip: true, reason: `non-HTML (${ct})`, tryBrowser: false, finalUrl: url }; return { skip: false, reason: null, tryBrowser: false, finalUrl: url }; } log.debug(`[PRECHECK] Too many redirects >= ${PRECHECK_MAX_REDIRECTS}`); return { skip: true, reason: `redirect-loop(${PRECHECK_MAX_REDIRECTS})`, tryBrowser: sawHtmlHint, finalUrl: null }; } // [web:167] // ---------- Browser lifecycle ---------- let browser; async function ensureBrowser() { if (browser && browser.isConnected()) return browser; if (browser) { try { await browser.close(); } catch {} } log.info(`[BROWSER] Launch headless Chromium`); browser = await chromium.launch({ executablePath: CHROMIUM_PATH, headless: true, args: CHROMIUM_ARGS }); return browser; } // [web:151] // ---------- Redirect chain builder (document-only) ---------- function buildRedirectChainForResponse(resp, maxLen = 50) { const chain = []; const req = resp.request(); if (req.resourceType() !== 'document') return chain; let prev = req.redirectedFrom(); let toUrl = req.url(); const status = resp.status(); while (prev) { chain.push({ from: prev.url(), to: toUrl, status }); toUrl = prev.url(); prev = prev.redirectedFrom(); if (chain.length >= maxLen) break; } return chain.reverse(); } // [web:151] // ---------- Quiet network window ---------- async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs }) { const start = Date.now(); while (Date.now() - start < timeoutMs) { const quietFor = Date.now() - lastChangeRef.value; if (inflightRef.value === 0 && quietFor >= quietMs) return; await new Promise(r => setTimeout(r, 100)); } } // [web:151] // ---------- Core scan with Playwright ---------- async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) { const b = await ensureBrowser(); const context = await b.newContext({ acceptDownloads: true, ...contextOpts }); // Безопасный лимитер редиректов для документной навигации await context.route('**', async route => { const request = route.request(); const isDoc = request.resourceType() === 'document'; const isNav = request.isNavigationRequest(); if (!(isDoc && isNav)) return route.continue(); try { const resp = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS }); const status = resp.status(); const headers = await resp.headers(); const body = await resp.body().catch(() => null); try { await route.fulfill({ status, headers, body }); } catch (e) { log.debug(`[ROUTE] fulfill failed for ${request.url()}: ${e?.message || e}`); await route.continue(); } } catch (e) { const msg = String(e?.message || ''); if (/redirect/i.test(msg) || /too many/i.test(msg)) { try { await route.fulfill({ status: 508, contentType: 'text/plain', body: 'Loop Detected: too many redirects' }); } catch (e2) { log.debug(`[ROUTE] fulfill(508) failed for ${request.url()}: ${e2?.message || e2}`); await route.continue(); } } else { log.debug(`[ROUTE] fetch failed for ${request.url()}: ${msg}`); await route.continue(); } } }); const page = await context.newPage(); const seenDomains = new Set(); const redirectLog = []; const visitedUrls = new Set(); const inflightRef = { value: 0 }; const lastChangeRef = { value: Date.now() }; if (DEBUG_ENABLED) { page.on('console', msg => log.debug(`[PAGE.CONSOLE] ${msg.type()}: ${msg.text()}`)); page.on('pageerror', err => log.debug(`[PAGE.ERROR] ${err?.message}`)); page.on('requestfailed', req => log.debug(`[REQ.FAIL] ${req.url()} reason=${req.failure()?.errorText}`)); } page.on('download', async dl => { try { await dl.failure().catch(() => {}); } catch {} log.debug(`[SCAN] Download ignored: ${dl.url()}`); }); const onReq = req => { inflightRef.value++; lastChangeRef.value = Date.now(); const d = extractDomain(req.url()); if (d) seenDomains.add(d); log.debug(`[REQ] ${req.method()} ${req.url()}`); }; const onResp = resp => { inflightRef.value = Math.max(0, inflightRef.value - 1); lastChangeRef.value = Date.now(); const d = extractDomain(resp.url()); if (d) seenDomains.add(d); const status = resp.status(); log.debug(`[RESP] ${status} ${resp.url()}`); if (status >= 300 && status < 400 && resp.request().resourceType() === 'document') { const piece = buildRedirectChainForResponse(resp, MAX_REDIRECT_STEPS + 5); redirectLog.push(...piece); } }; page.on('request', onReq); page.on('response', onResp); try { log.info(`[SCAN] goto(${startUrl}) domcontentloaded timeout=${NAV_TIMEOUT_MS}`); let response; try { response = await page.goto(startUrl, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT_MS }); } catch (e) { const msg = String(e?.message || ''); if (/Download is starting/i.test(msg)) { log.info(`[SCAN] goto triggered download; continue as non-HTML`); } else { throw e; } } if (response && response.status && response.status() === 508) { throw new Error(`Too many redirects (${MAX_REDIRECT_STEPS})`); } await quietWindowWait({ inflightRef, lastChangeRef, timeoutMs: NAV_TIMEOUT_MS, quietMs: QUIET_WINDOW_MS }); const finalUrl = page.url(); if (visitedUrls.has(finalUrl)) throw new Error('Redirect loop detected'); visitedUrls.add(finalUrl); const steps = redirectLog.length; if (steps > MAX_REDIRECT_STEPS) throw new Error(`Too many redirects (${steps})`); await context.close(); const relatedDomains = Array.from(seenDomains) .filter(d => !d.includes('doubleclick') && !d.includes('google')) .sort(); log.info(`[SCAN] Done finalUrl=${finalUrl} domains=${relatedDomains.length} redirects=${steps}`); return { finalUrl, relatedDomains, redirectChain: redirectLog }; } catch (e) { try { await context.close(); } catch {} try { if (browser && typeof browser.isConnected === 'function' && !browser.isConnected()) { await browser.close(); browser = null; } } catch {} log.error(`[SCAN] Error: ${e?.message}`); throw e; } finally { page.off('request', onReq); page.off('response', onResp); } } // ---------- High-level scan with precheck and escalation ---------- async function scanDomainOnce(originDomain) { const startUrl = `https://${originDomain}`; log.info(`[SCAN] Start domain="${originDomain}" url=${startUrl}`); const pre = await precheckFollowManually(startUrl); if (pre.skip && (pre.reason === 'attachment' || (pre.reason || '').startsWith('non-HTML'))) { log.info(`[SCAN] Skip non-HTML/attachment: ${pre.reason}`); return { finalUrl: pre.finalUrl || startUrl, relatedDomains: [originDomain], redirectChain: [], precheck: pre.reason }; } let targetUrl = startUrl; if (pre.skip && /^marketing-redirect/.test(pre.reason || '') && pre.finalUrl) { log.info(`[SCAN] Marketing redirect -> follow target in browser: ${pre.finalUrl}`); targetUrl = pre.finalUrl; } else if (pre.skip && pre.tryBrowser) { log.info(`[SCAN] Escalation to browser due to ${pre.reason}`); } const contextOpts = { userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', locale: 'en-US', timezoneId: 'UTC', }; try { const result = await scanWithBrowser(originDomain, targetUrl, contextOpts); if (!result.relatedDomains.includes(originDomain)) { result.relatedDomains.unshift(originDomain); } return result; } catch (e) { log.warn(`[SCAN] Browser escalation failed: ${e?.message}`); return { finalUrl: targetUrl, relatedDomains: [originDomain], redirectChain: [], precheck: pre.reason || 'blocked' }; } } // ---------- Cache helpers ---------- function getFromCache(domain) { const row = stmtSelect.get(domain); if (!row) return null; const now = Math.floor(Date.now() / 1000); if (row.ttl_at > now) { try { const out = { relatedDomains: JSON.parse(row.result_json), finalUrl: row.final_url || null, redirectChain: row.redirect_chain_json ? JSON.parse(row.redirect_chain_json) : [], cached: true, cachedAt: row.updated_at, ttlAt: row.ttl_at, }; return out; } catch (e) { log.warn(`[CACHE] Parse error: ${e?.message}`); return null; } } return null; } function putToCache(domain, result) { const now = Math.floor(Date.now() / 1000); const ttlAt = now + CACHE_TTL_SECONDS; try { stmtUpsert.run({ domain, result_json: JSON.stringify(result.relatedDomains || []), final_url: result.finalUrl || null, redirect_chain_json: JSON.stringify(result.redirectChain || []), updated_at: now, ttl_at: ttlAt, }); log.info(`[CACHE] Upsert ${domain} ttlAt=${ttlAt}`); } catch (e) { log.warn(`[CACHE] Upsert error: ${e?.message}`); } } // ---------- Routes ---------- app.get('/domains', async (req, res) => { res.type('application/json'); const raw = req.query.domain; log.info(`[HTTP] /domains?domain=${raw}`); const domain = normalizeDomain(raw); if (!domain) { res.status(400).json({ error: '"domain" must be a valid hostname', code: 'BAD_DOMAIN' }); return; } const HARD_TIMEOUT = parseInt(process.env.HARD_TIMEOUT_MS || '70000', 10); const hardTimer = setTimeout(() => { try { if (!res.headersSent) res.status(504).json({ error: 'Gateway Timeout', code: 'TIMEOUT' }); } catch {} }, HARD_TIMEOUT); try { const cached = getFromCache(domain); if (cached) { log.info(`[HTTP] Cache HIT ${domain}`); res.status(200).json({ domain, finalUrl: cached.finalUrl, relatedDomains: cached.relatedDomains, redirectChain: cached.redirectChain, cached: true, cachedAt: cached.cachedAt, ttlAt: cached.ttlAt, status: 'ok' }); return; } const result = await scanDomainOnce(domain); if (result.precheck) { if ((result.precheck || '').startsWith('marketing-redirect')) { res.status(200).json({ domain, finalUrl: result.finalUrl || `https://${domain}`, relatedDomains: [domain], redirectChain: [], cached: false, status: 'ok', note: result.precheck }); return; } res.status(200).json({ domain, finalUrl: result.finalUrl || `https://${domain}`, relatedDomains: [domain], redirectChain: [], cached: false, status: (result.precheck === 'forbidden' || result.precheck === 'blocked') ? 'blocked' : 'skipped', reason: result.precheck }); return; } putToCache(domain, result); res.status(200).json({ domain, finalUrl: result.finalUrl, relatedDomains: result.relatedDomains, redirectChain: result.redirectChain, cached: false, status: 'ok' }); } catch (e) { const msg = String(e?.message || 'Internal error'); log.error(`[HTTP] Error for ${domain}: ${msg}`); const forbidden = /403|forbidden|blocked/i.test(msg); res.status(forbidden ? 403 : 500).json({ error: forbidden ? 'Forbidden' : 'Internal server error', code: forbidden ? 'FORBIDDEN' : 'INTERNAL', details: msg }); } finally { clearTimeout(hardTimer); } }); app.get('/health', (_req, res) => { res.type('application/json'); res.json({ ok: true }); }); // ---------- Signals ---------- process.on('SIGTERM', async () => { log.info('[SIGNAL] SIGTERM'); try { if (browser) await browser.close(); } catch {} process.exit(0); }); process.on('SIGINT', async () => { log.info('[SIGNAL] SIGINT'); try { if (browser) await browser.close(); } catch {} process.exit(0); }); // ---------- Start ---------- app.listen(PORT, () => { log.info(`Domain scanner service listening on port ${PORT}`); });