diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..229f78d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,13 @@ +* + +!package.json + +!package-lock.json + +!server.js + +!ignore-domains.txt + +!LICENSE + +!README.md diff --git a/.forgejo/workflows/docker-build-push.yaml b/.forgejo/workflows/docker-build-push.yaml deleted file mode 100644 index 340cceb..0000000 --- a/.forgejo/workflows/docker-build-push.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: Build and Push Docker Image - -on: - push: - branches: - - main - paths: - - '**/*' # Триггер при любом изменении репозитория - -jobs: - build-and-push: - runs-on: docker - - container: - image: docker:24.0.1 - - steps: - # Установка Docker CLI (если не в базовом образе) - - name: Setup Docker CLI - run: | - apk add --no-cache docker-cli - - # Авторизация в Docker Hub - токен необходимо добавить в Secrets - - name: Login to Docker Hub - env: - DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} - DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} - run: | - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin - - # Клонирование исходников — встроено в Forgejo Actions - - # Сборка Docker образа - - name: Build Docker Image - run: | - docker build -t ${DOCKER_USERNAME}/playwright-domain-scanner:latest . 
- - # Push образ на Docker Hub - - name: Push Docker Image - run: | - docker push ${DOCKER_USERNAME}/playwright-domain-scanner:latest - diff --git a/.github/workflows/docker-build-push.yaml b/.github/workflows/docker-build-push.yaml new file mode 100644 index 0000000..be6ac82 --- /dev/null +++ b/.github/workflows/docker-build-push.yaml @@ -0,0 +1,28 @@ +name: Build and Push Docker Image + +on: + push: + branches: + - main + +jobs: + build-and-push: + runs-on: ubuntu-latest + environment: dockerhub + + steps: + - name: Checkout the repository + uses: actions/checkout@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Build and push Docker image + uses: docker/build-push-action@v4 + with: + push: true + tags: ${{ secrets.DOCKER_USERNAME }}/gekata:latest + diff --git a/Dockerfile b/Dockerfile index f95e58b..628bd48 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,46 +1,65 @@ -# Use official Node.js LTS base image -FROM node:20-slim +# -------- Builder stage -------- +FROM debian:bookworm-slim AS builder +ENV DEBIAN_FRONTEND=noninteractive -# Install dependencies for running Chromium -RUN apt-get update && apt-get install -y \ - ca-certificates \ - fonts-liberation \ - libasound2 \ - libatk-bridge2.0-0 \ - libatk1.0-0 \ - libcups2 \ - libdbus-1-3 \ - libdrm2 \ - libgbm1 \ - libgtk-3-0 \ - libnspr4 \ - libnss3 \ - libx11-xcb1 \ - libxcomposite1 \ - libxdamage1 \ - libxrandr2 \ - xdg-utils \ - wget \ - --no-install-recommends && \ - rm -rf /var/lib/apt/lists/* +# Node + build tools for native modules (better-sqlite3) +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates curl gnupg \ + nodejs npm \ + python3 make g++ pkg-config libsqlite3-dev \ + && rm -rf /var/lib/apt/lists/* -# Set working directory -WORKDIR /usr/src/app +WORKDIR /app -# Copy package files and install dependencies +# Copy only manifests first to leverage 
Docker cache COPY package*.json ./ -COPY ignore-domains.txt ./ -RUN npm ci -# Install Playwright browsers (Chromium) -RUN npx playwright install chromium +# Install production deps (build native modules here) +ENV CI=true +RUN npm ci --omit=dev -# Copy app sources +# Copy source COPY . . -# Expose port +# -------- Runtime stage -------- +FROM debian:bookworm-slim +ENV DEBIAN_FRONTEND=noninteractive + +# Install tini for proper PID 1 and signal handling +# Install Node.js runtime, Chromium and minimal libs +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates curl gnupg \ + tini \ + nodejs npm \ + chromium \ + libx11-6 libxcomposite1 libxdamage1 libxrandr2 libxkbcommon0 \ + libgtk-3-0 libnss3 libdrm2 libgbm1 libasound2 fonts-liberation \ + fonts-dejavu-core \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy node_modules and app from builder +COPY --from=builder /app/node_modules /app/node_modules +COPY --from=builder /app/package*.json /app/ +COPY . . + +# Security: drop root +RUN useradd -ms /bin/bash nodeuser && chown -R nodeuser:nodeuser /app +USER nodeuser + +# Environment +ENV PORT=3000 \ + PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 \ + PLAYWRIGHT_BROWSERS_PATH=0 \ + CHROMIUM_PATH=/usr/bin/chromium \ + CACHE_TTL_SECONDS=21600 + EXPOSE 3000 -# Run the service -CMD ["npm", "start"] +# Use tini as PID 1 so we don't need `--init` +ENTRYPOINT ["/usr/bin/tini", "--"] + +# Start the service +CMD ["node", "server.js"] diff --git a/README.md b/README.md index 2990f20..874aedf 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,117 @@ -# gekata +**Gekata** — это легковесный сервис на Node.js для извлечения «связных доменов» с веб-страниц, запускаемый в контейнере Debian с Chromium; он сначала делает предзапросы с ручным следованием редиректам, затем при необходимости поднимает безголовый браузер, ограничивает глубину редиректов и кэширует результаты в SQLite через better-sqlite3. 
Сервис предоставляет HTTP API /domains, возвращающее финальный URL, цепочку редиректов и список связанных доменов, а также /health для проверки готовности. -Extract all domains from site \ No newline at end of file +### Назначение + +Gekata сканирует заданный домен, разрешает маркетинговые и другие редиректы до целевой HTML-страницы, загружает её в безголовом Chromium и собирает множество доменных имён из всех сетевых запросов страницы, формируя список «связных доменов» для анализа интеграций, трекинга и CDN. Такой подход работает и для динамических сайтов с клиентским рендерингом. + +### Архитектура + +- Веб-сервер на Express предоставляет REST‑маршруты, принимает домен, валидирует его и инициирует сканирование, обрабатывая таймауты и коды ошибок. +- Предпроверка делает GET с ручным управлением редиректами, классифицируя сценарии: форс‑редиректы «маркетинга», запреты 403, скачивания и не‑HTML контент, чтобы экономить запуск браузера. +- Эскалация в безголовый Chromium (через Playwright) выполняет навигацию, применяя ограничение глубины редиректов только для документных переходов и ожидая «тихое окно» сети для стабильного сбора доменов. +- Кэширование результатов в SQLite с TTL ускоряет повторные запросы; используется подходящая для продакшена библиотека better-sqlite3 и WAL‑журналирование для устойчивости. + + +### Потоки данных + +- Вход: GET /domains?domain= — принимает хост или URL, нормализует до ASCII/Punycode и формирует стартовый https:// URL. +- Предобработка: ручной обход 3xx с ограничением шагов; детекция «похожих на файл» ссылок и контента non‑HTML; маркетинговый редирект помечается и может быть целевым. +- Сканирование браузером: навигация на целевой URL, слежение за запросами/ответами страницы, сбор доменов из всех сетевых событий, исключая шум (google/doubleclick по эвристике), построение цепочки редиректов для документной навигации. 
+- Выход: JSON с finalUrl, relatedDomains[], redirectChain[], статусами ok/skipped/blocked и служебными пометками (cached, ttl). + + +### API + +- GET /domains +Параметры: domain — доменное имя или URL. +Ответ 200 ok: + - domain: нормализованный запрошенный домен. + - finalUrl: конечный URL после редиректов/навигации. + - relatedDomains: уникальные домены, замеченные при загрузке страницы. + - redirectChain: массив { from, to, status } для документных 3xx. + - cached: true/false, cachedAt, ttlAt. + - status: ok | skipped | blocked; дополнительные note/reason при skip/blocked. +- GET /health — простой JSON { ok: true } для readiness/liveness. + + +### Обработка редиректов + +- На этапе предпроверки ограничение PRECHECK_MAX_REDIRECTS предотвращает бесконечные цепочки до запуска браузера; 403 заставляет эскалировать в браузер, non‑HTML/attachment возвращают немедленный ответ. +- В браузере включён маршрут‑ограничитель только для документной навигации: запросы навигации обрабатываются с maxRedirects, ассеты идут без ограничений, чтобы не ломать рендеринг. +- Если лимит превышен, навигация завершается контролируемо и возвращается ошибка «Too many redirects», переводимая в понятный статус ответа API. + + +### Кэш и TTL + +- SQLite таблица domain_cache хранит: домен, JSON списка доменов, финальный URL, цепочку редиректов, время обновления и ttl_at. +- Повторные обращения до истечения TTL возвращают сохранённый результат без запуска браузера, снижая задержки и нагрузку. + + +### Контейнеризация + +- Образ состоит из двух стадий: builder и runtime, обе на debian:bookworm-slim. +- Стадия builder устанавливает Node.js, компилятор и заголовки SQLite для сборки native‑модуля better‑sqlite3, затем выполняет npm ci с пропуском dev‑зависимостей и копирует исходники. 
+- Стадия runtime устанавливает tini как корректный PID 1, Node.js runtime, системный Chromium и минимальный набор X/GTK/NSS/GBM/шрифтов, необходимых для безголового режима; копируются node_modules и исходники из builder. +- Создаётся непривилегированный пользователь nodeuser; директория приложения принадлежит ему; сервис запускается не от root. + + +### Переменные окружения + +- PORT — порт HTTP сервера (по умолчанию 3000). +- CHROMIUM_PATH — путь к системному Chromium (/usr/bin/chromium в контейнере). +- CACHE_TTL_SECONDS — срок жизни кэша (по умолчанию 6 часов). +- HARD_TIMEOUT_MS — жёсткий таймаут обработки HTTP‑запроса (по умолчанию 70 секунд). +- MAX_REDIRECT_STEPS — максимальная глубина редиректов для документной навигации (по умолчанию 20). +- NAV_TIMEOUT_MS, QUIET_WINDOW_MS — таймауты навигации и «тихого окна» сети. +- DEBUG — включает подробные логи страницы/сетевых событий при значении 1. + + +### Безопасность и устойчивость + +- tini как init обрабатывает сигналы и «зомби» процессы; контейнер корректно завершает Chromium по SIGTERM/SIGINT, предотвращая утечки. +- Запуск под непривилегированным пользователем снижает риск компрометации; Chromium стартует с флагами no‑sandbox/disable‑setuid-sandbox, что совместимо с беспривилегированным окружением контейнеров. +- Ограничение редиректов для документных переходов устраняет зацикливание «маркетинговых» и неверных конфигураций, не влияя на загрузку ассетов. + + +### Производительность + +- npm ci в builder‑стадии плюс копирование package*.json до исходников задействуют кэш слоёв Docker, ускоряя сборки. +- better‑sqlite3 с синхронными подготовленными выражениями обеспечивает быстрый локальный кэш без отдельного сервиса БД. +- Предпроверка HTTP избавляет от лишних подъёмов браузера для не‑HTML или «прикреплённых» ответов. + + +### Сборка и запуск + +- Сборка образа: + - docker build -t gekata:latest . 
+- Запуск контейнера: + - docker run --rm -p 3000:3000 -e CACHE_TTL_SECONDS=21600 -e MAX_REDIRECT_STEPS=20 gekata:latest +- Примеры запросов: + - curl -s "http://localhost:3000/health" + - curl -s "http://localhost:3000/domains?domain=forum.xda-developers.com" + + +### Журналирование и диагностика + +- Лог‑метки [BOOT], [HTTP], [SCAN], [BROWSER], [CACHE], [SIGNAL] позволяют быстро локализовать этап и тип события. +- При включённом DEBUG=1 логируются консоль страницы, ошибки, неудавшиеся запросы и сетевые события, что помогает анализировать блокировки, CORS, антибот‑защиту и таймауты. + + +### Ограничения + +- Сайты с жёсткими антибот‑мерами (403/JS‑челленджи) могут быть помечены как blocked или потребовать дополнительной эмуляции (например, иные user‑agent/locale/timezone/proxy). +- Сбор связанных доменов базируется на фактически выполненных сетевых запросах и может меняться при A/B тестах, гео‑таргетинге или различиях по user‑agent. + + +### Расширения и доработки + +- Добавить белый/чёрный список доменов, тонкую фильтрацию трекеров и интеграций. +- Вынести кэш во внешний SQLite‑файл через volume для сохранения между рестартами, настроить резервное копирование. +- Параметризовать user‑agent/locale/timezone и добавить поддержку прокси для региональных сценариев. +- Экспортировать полный сетевой журнал и тайминги (HAR‑подобный формат) как опциональную выгрузку. + + +### Файлы проекта + +- server.js — основной сервис, логика API, предобработка, сканирование браузером, кэш, ограничения редиректов, завершение по сигналам. +- Dockerfile — двухстадийная сборка, системный Chromium в рантайме, tini, непривилегированный пользователь, переменные окружения и запуск службы. 
diff --git a/ignore-domains.txt b/ignore-domains.txt index bf4888e..d6033b8 100644 --- a/ignore-domains.txt +++ b/ignore-domains.txt @@ -1,3 +1,3 @@ doubleclick google - +yandex diff --git a/package-lock.json b/package-lock.json index 2c45af6..3e08c4c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8,7 +8,9 @@ "name": "playwright-domain-scanner", "version": "1.0.0", "dependencies": { + "better-sqlite3": "^9.0.0", "express": "^4.18.2", + "express-rate-limit": "^8.1.0", "playwright": "^1.42.0" } }, @@ -29,6 +31,53 @@ "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==" }, + "node_modules/base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, + "node_modules/better-sqlite3": { + "version": "9.6.0", + "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-9.6.0.tgz", + "integrity": "sha512-yR5HATnqeYNVnkaUTf4bOP2dJSnyhP4puJN/QPRyx4YkBEEUxib422n2XzPqDEHjQQqazoYoADdAm5vE15+dAQ==", + "hasInstallScript": true, + "dependencies": { + "bindings": "^1.5.0", + "prebuild-install": "^7.1.1" + } + }, + "node_modules/bindings": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz", + "integrity": "sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==", + "dependencies": { + "file-uri-to-path": "1.0.0" + } + }, + "node_modules/bl": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", + "integrity": 
"sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==", + "dependencies": { + "buffer": "^5.5.0", + "inherits": "^2.0.4", + "readable-stream": "^3.4.0" + } + }, "node_modules/body-parser": { "version": "1.20.3", "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz", @@ -52,6 +101,29 @@ "npm": "1.2.8000 || >= 1.4.16" } }, + "node_modules/buffer": { + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz", + "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "dependencies": { + "base64-js": "^1.3.1", + "ieee754": "^1.1.13" + } + }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -87,6 +159,11 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/chownr": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", + "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==" + }, "node_modules/content-disposition": { "version": "0.5.4", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", @@ -127,6 +204,28 @@ "ms": "2.0.0" } }, + "node_modules/decompress-response": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz", + "integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==", + "dependencies": { + "mimic-response": "^3.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + 
"node_modules/deep-extend": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz", + "integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==", + "engines": { + "node": ">=4.0.0" + } + }, "node_modules/depd": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", @@ -144,6 +243,14 @@ "npm": "1.2.8000 || >= 1.4.16" } }, + "node_modules/detect-libc": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.4.tgz", + "integrity": "sha512-3UDv+G9CsCKO1WKMGw9fwq/SWJYbI0c5Y7LU1AXYoDdbhE2AHQ6N6Nb34sG8Fj7T5APy8qXDCKuuIHd1BR0tVA==", + "engines": { + "node": ">=8" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -170,6 +277,14 @@ "node": ">= 0.8" } }, + "node_modules/end-of-stream": { + "version": "1.4.5", + "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz", + "integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==", + "dependencies": { + "once": "^1.4.0" + } + }, "node_modules/es-define-property": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", @@ -210,6 +325,14 @@ "node": ">= 0.6" } }, + "node_modules/expand-template": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz", + "integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==", + "engines": { + "node": ">=6" + } + }, "node_modules/express": { "version": "4.21.2", "resolved": "https://registry.npmjs.org/express/-/express-4.21.2.tgz", @@ -255,6 +378,28 @@ "url": "https://opencollective.com/express" } }, + "node_modules/express-rate-limit": { + "version": "8.1.0", + "resolved": 
"https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.1.0.tgz", + "integrity": "sha512-4nLnATuKupnmwqiJc27b4dCFmB/T60ExgmtDD7waf4LdrbJ8CPZzZRHYErDYNhoz+ql8fUdYwM/opf90PoPAQA==", + "dependencies": { + "ip-address": "10.0.1" + }, + "engines": { + "node": ">= 16" + }, + "funding": { + "url": "https://github.com/sponsors/express-rate-limit" + }, + "peerDependencies": { + "express": ">= 4.11" + } + }, + "node_modules/file-uri-to-path": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz", + "integrity": "sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==" + }, "node_modules/finalhandler": { "version": "1.3.1", "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.1.tgz", @@ -288,6 +433,11 @@ "node": ">= 0.6" } }, + "node_modules/fs-constants": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", + "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==" + }, "node_modules/fsevents": { "version": "2.3.2", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", @@ -344,6 +494,11 @@ "node": ">= 0.4" } }, + "node_modules/github-from-package": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", + "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==" + }, "node_modules/gopd": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", @@ -403,11 +558,43 @@ "node": ">=0.10.0" } }, + "node_modules/ieee754": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", + "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", + "funding": [ + { + "type": "github", + "url": 
"https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, "node_modules/inherits": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" }, + "node_modules/ini": { + "version": "1.3.8", + "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", + "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" + }, + "node_modules/ip-address": { + "version": "10.0.1", + "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.0.1.tgz", + "integrity": "sha512-NWv9YLW4PoW2B7xtzaS3NCot75m6nK7Icdv0o3lfMceJVRfSoQwqD4wEH5rLwoKJwUiZ/rfpiVBhnaF0FK4HoA==", + "engines": { + "node": ">= 12" + } + }, "node_modules/ipaddr.js": { "version": "1.9.1", "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", @@ -478,11 +665,40 @@ "node": ">= 0.6" } }, + "node_modules/mimic-response": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz", + "integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/mkdirp-classic": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", + "integrity": 
"sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==" + }, "node_modules/ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==" }, + "node_modules/napi-build-utils": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-2.0.0.tgz", + "integrity": "sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA==" + }, "node_modules/negotiator": { "version": "0.6.3", "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz", @@ -491,6 +707,17 @@ "node": ">= 0.6" } }, + "node_modules/node-abi": { + "version": "3.77.0", + "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.77.0.tgz", + "integrity": "sha512-DSmt0OEcLoK4i3NuscSbGjOf3bqiDEutejqENSplMSFA/gmB8mkED9G4pKWnPl7MDU4rSHebKPHeitpDfyH0cQ==", + "dependencies": { + "semver": "^7.3.5" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/object-inspect": { "version": "1.13.4", "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz", @@ -513,6 +740,14 @@ "node": ">= 0.8" } }, + "node_modules/once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "dependencies": { + "wrappy": "1" + } + }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -554,6 +789,31 @@ "node": ">=18" } }, + "node_modules/prebuild-install": { + "version": "7.1.3", + "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz", + "integrity": "sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==", + "dependencies": { + "detect-libc": 
"^2.0.0", + "expand-template": "^2.0.3", + "github-from-package": "0.0.0", + "minimist": "^1.2.3", + "mkdirp-classic": "^0.5.3", + "napi-build-utils": "^2.0.0", + "node-abi": "^3.3.0", + "pump": "^3.0.0", + "rc": "^1.2.7", + "simple-get": "^4.0.0", + "tar-fs": "^2.0.0", + "tunnel-agent": "^0.6.0" + }, + "bin": { + "prebuild-install": "bin.js" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/proxy-addr": { "version": "2.0.7", "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", @@ -566,6 +826,15 @@ "node": ">= 0.10" } }, + "node_modules/pump": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz", + "integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==", + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, "node_modules/qs": { "version": "6.13.0", "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", @@ -602,6 +871,33 @@ "node": ">= 0.8" } }, + "node_modules/rc": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz", + "integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==", + "dependencies": { + "deep-extend": "^0.6.0", + "ini": "~1.3.0", + "minimist": "^1.2.0", + "strip-json-comments": "~2.0.1" + }, + "bin": { + "rc": "cli.js" + } + }, + "node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/safe-buffer": { "version": "5.2.1", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", @@ -626,6 +922,17 @@ "resolved": 
"https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" }, + "node_modules/semver": { + "version": "7.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", + "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/send": { "version": "0.19.0", "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", @@ -749,6 +1056,49 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/simple-concat": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz", + "integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, + "node_modules/simple-get": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz", + "integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "dependencies": { + "decompress-response": "^6.0.0", + "once": "^1.3.1", + "simple-concat": "^1.0.0" + } + }, "node_modules/statuses": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz", @@ -757,6 +1107,48 @@ "node": ">= 0.8" } }, + "node_modules/string_decoder": { + "version": "1.3.0", + 
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "dependencies": { + "safe-buffer": "~5.2.0" + } + }, + "node_modules/strip-json-comments": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", + "integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/tar-fs": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.3.tgz", + "integrity": "sha512-090nwYJDmlhwFwEW3QQl+vaNnxsO2yVsd45eTKRBzSzu+hlb1w2K9inVq5b0ngXuLVqQ4ApvsUHHnu/zQNkWAg==", + "dependencies": { + "chownr": "^1.1.1", + "mkdirp-classic": "^0.5.2", + "pump": "^3.0.0", + "tar-stream": "^2.1.4" + } + }, + "node_modules/tar-stream": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", + "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", + "dependencies": { + "bl": "^4.0.3", + "end-of-stream": "^1.4.1", + "fs-constants": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^3.1.1" + }, + "engines": { + "node": ">=6" + } + }, "node_modules/toidentifier": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz", @@ -765,6 +1157,17 @@ "node": ">=0.6" } }, + "node_modules/tunnel-agent": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", + "integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==", + "dependencies": { + "safe-buffer": "^5.0.1" + }, + "engines": { + "node": "*" + } + }, "node_modules/type-is": { "version": "1.6.18", "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", 
@@ -785,6 +1188,11 @@ "node": ">= 0.8" } }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==" + }, "node_modules/utils-merge": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", @@ -800,7 +1208,11 @@ "engines": { "node": ">= 0.8" } + }, + "node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==" } } } - diff --git a/package.json b/package.json index da44098..a69fb55 100644 --- a/package.json +++ b/package.json @@ -7,8 +7,9 @@ "start": "node server.js" }, "dependencies": { + "better-sqlite3": "^9.0.0", "express": "^4.18.2", + "express-rate-limit": "^8.1.0", "playwright": "^1.42.0" } } - diff --git a/server.js b/server.js index f6ef7fe..bdd242b 100644 --- a/server.js +++ b/server.js @@ -1,56 +1,518 @@ +// server.js const express = require('express'); const { chromium } = require('playwright'); +const Database = require('better-sqlite3'); +// Убираем punycode; используем WHATWG URL + domainToASCII +const { URL, domainToASCII } = require('node:url'); const app = express(); -const port = process.env.PORT || 3000; + +// ---------- Config ---------- +const PORT = Number(process.env.PORT || 3000); +const CHROMIUM_PATH = process.env.CHROMIUM_PATH || undefined; +const CACHE_TTL_SECONDS = parseInt(process.env.CACHE_TTL_SECONDS || '21600', 10); +const MAX_REDIRECT_STEPS = parseInt(process.env.MAX_REDIRECT_STEPS || '20', 10); +const NAV_TIMEOUT_MS = parseInt(process.env.NAV_TIMEOUT_MS || '30000', 10); +const QUIET_WINDOW_MS = parseInt(process.env.QUIET_WINDOW_MS || '700', 10); +const PRECHECK_MAX_REDIRECTS = parseInt(process.env.PRECHECK_MAX_REDIRECTS || 
'15', 10); +const SQLITE_PATH = process.env.SQLITE_PATH || './cache.db'; +const DEBUG_ENABLED = String(process.env.DEBUG || '').trim() === '1'; +const CHROMIUM_ARGS = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--no-zygote', +]; + +// ---------- Logging ---------- +const log = { + info: (...a) => console.log(...a), + debug: (...a) => { if (DEBUG_ENABLED) console.log(...a); }, + warn: (...a) => console.warn(...a), + error: (...a) => console.error(...a), +}; + +// ---------- DB ---------- +log.info(`[BOOT] SQLite path: ${SQLITE_PATH}`); +const db = new Database(SQLITE_PATH); +db.pragma('journal_mode = WAL'); +db.exec(` +CREATE TABLE IF NOT EXISTS domain_cache ( + domain TEXT PRIMARY KEY, + result_json TEXT NOT NULL, + final_url TEXT, + redirect_chain_json TEXT, + updated_at INTEGER NOT NULL, + ttl_at INTEGER NOT NULL +); +`); +const stmtSelect = db.prepare(` + SELECT result_json, final_url, redirect_chain_json, updated_at, ttl_at + FROM domain_cache WHERE domain = ? +`); +const stmtUpsert = db.prepare(` +INSERT INTO domain_cache (domain, result_json, final_url, redirect_chain_json, updated_at, ttl_at) +VALUES (@domain, @result_json, @final_url, @redirect_chain_json, @updated_at, @ttl_at) +ON CONFLICT(domain) DO UPDATE SET + result_json = excluded.result_json, + final_url = excluded.final_url, + redirect_chain_json = excluded.redirect_chain_json, + updated_at = excluded.updated_at, + ttl_at = excluded.ttl_at +`); app.use(express.json()); -function extractDomain(url) { +// ---------- Helpers ---------- +function normalizeDomain(input) { + if (!input || typeof input !== 'string') return null; + const s = input.trim().toLowerCase(); try { - return new URL(url).hostname; + // Если это URL, берём hostname; иначе считаем, что это просто хост + const asUrl = /^https?:\/\//i.test(s) ? 
s : `https://${s}`; + const u = new URL(asUrl); + // Преобразуем к IDNA ASCII (Punycode) через WHATWG util + const ascii = domainToASCII(u.hostname || ''); + return ascii || null; } catch { - return null; + // Попытка прямой IDNA-конверсии из строки (на случай голого хоста без схемы) + try { + const ascii = domainToASCII(s); + return ascii || null; + } catch { + return null; + } + } +} // WHATWG URL + url.domainToASCII [web:167][web:161][web:164] + +function extractDomain(url) { + try { return new URL(url).hostname.toLowerCase(); } catch { return null; } +} + +// эвристика «выглядит как файл» +function looksLikeFilePath(u) { + try { + const { pathname } = new URL(u); + return /\.(?:zip|pdf|png|jpe?g|gif|webp|svg|mp4|mp3|wav|csv|xlsx?|docx?|pptx?|exe|deb|rpm|apk|tar(?:\.gz)?|7z|gz|bz2)$/i.test(pathname); + } catch { return false; } +} + +// канонизация URL для детекции петель +function normalizeUrlForLoop(u) { + try { + const x = new URL(u); + x.hash = ''; + return x.toString(); + } catch { return u; } +} + +// ---------- Precheck: manual redirects & classification ---------- +async function precheckFollowManually(startUrl) { + let url = startUrl; + const visited = new Set(); + let sawHtmlHint = false; + for (let i = 0; i < PRECHECK_MAX_REDIRECTS; i++) { + const norm = normalizeUrlForLoop(url); + if (visited.has(norm)) { + log.debug(`[PRECHECK] Loop at ${norm}`); + return { skip: true, reason: 'redirect-loop', tryBrowser: sawHtmlHint }; + } + visited.add(norm); + let res; + try { + res = await fetch(url, { method: 'GET', redirect: 'manual' }); + } catch (e) { + log.debug(`[PRECHECK] GET(manual) failed for ${url}: ${e?.message}`); + return { skip: false, reason: null, tryBrowser: false }; + } + const status = res.status; + const ct = res.headers.get('content-type') || ''; + const cd = res.headers.get('content-disposition') || ''; + const loc = res.headers.get('location') || ''; + log.debug(`[PRECHECK] step=${i} status=${status} ct="${ct}" cd="${cd || '-'}" loc="${loc 
|| '-'}"`); + const isHtml = /\btext\/html\b/i.test(ct); + if (isHtml) sawHtmlHint = true; + const isAttachment = /attachment/i.test(cd); + if (status === 403) { + return { skip: true, reason: 'forbidden', tryBrowser: true }; + } + if (status >= 300 && status < 400 && loc) { + const next = new URL(loc, url).toString(); + if (looksLikeFilePath(next) || /download|file|export/i.test(next)) { + return { skip: true, reason: `redirect-to-file(${next})`, tryBrowser: false, finalUrl: next }; + } + try { + const probe = await fetch(next, { method: 'GET', redirect: 'manual' }); + const pct = probe.headers.get('content-type') || ''; + const isHtmlTarget = /\btext\/html\b/i.test(pct); + if (isHtmlTarget) { + return { skip: true, reason: `marketing-redirect(${next})`, tryBrowser: false, finalUrl: next }; + } + } catch {} + url = next; + continue; + } + if (isAttachment) return { skip: true, reason: 'attachment', tryBrowser: false, finalUrl: url }; + if (!isHtml && ct) return { skip: true, reason: `non-HTML (${ct})`, tryBrowser: false, finalUrl: url }; + return { skip: false, reason: null, tryBrowser: false, finalUrl: url }; + } + log.debug(`[PRECHECK] Too many redirects >= ${PRECHECK_MAX_REDIRECTS}`); + return { skip: true, reason: `redirect-loop(${PRECHECK_MAX_REDIRECTS})`, tryBrowser: sawHtmlHint, finalUrl: null }; +} // [web:167] + +// ---------- Browser lifecycle ---------- +let browser; +async function ensureBrowser() { + if (browser && browser.isConnected()) return browser; + if (browser) { try { await browser.close(); } catch {} } + log.info(`[BROWSER] Launch headless Chromium`); + browser = await chromium.launch({ executablePath: CHROMIUM_PATH, headless: true, args: CHROMIUM_ARGS }); + return browser; +} // [web:151] + +// ---------- Redirect chain builder (document-only) ---------- +function buildRedirectChainForResponse(resp, maxLen = 50) { + const chain = []; + const req = resp.request(); + if (req.resourceType() !== 'document') return chain; + let prev = 
req.redirectedFrom(); + let toUrl = req.url(); + const status = resp.status(); + while (prev) { + chain.push({ from: prev.url(), to: toUrl, status }); + toUrl = prev.url(); + prev = prev.redirectedFrom(); + if (chain.length >= maxLen) break; + } + return chain.reverse(); +} // [web:151] + +// ---------- Quiet network window ---------- +async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs }) { + const start = Date.now(); + while (Date.now() - start < timeoutMs) { + const quietFor = Date.now() - lastChangeRef.value; + if (inflightRef.value === 0 && quietFor >= quietMs) return; + await new Promise(r => setTimeout(r, 100)); + } +} // [web:151] + +// ---------- Core scan with Playwright ---------- +async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) { + const b = await ensureBrowser(); + const context = await b.newContext({ acceptDownloads: true, ...contextOpts }); + + // Безопасный лимитер редиректов для документной навигации + await context.route('**', async route => { + const request = route.request(); + const isDoc = request.resourceType() === 'document'; + const isNav = request.isNavigationRequest(); + if (!(isDoc && isNav)) return route.continue(); + try { + const resp = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS }); + const status = resp.status(); + const headers = await resp.headers(); + const body = await resp.body().catch(() => null); + try { + await route.fulfill({ status, headers, body }); + } catch (e) { + log.debug(`[ROUTE] fulfill failed for ${request.url()}: ${e?.message || e}`); + await route.continue(); + } + } catch (e) { + const msg = String(e?.message || ''); + if (/redirect/i.test(msg) || /too many/i.test(msg)) { + try { + await route.fulfill({ + status: 508, + contentType: 'text/plain', + body: 'Loop Detected: too many redirects' + }); + } catch (e2) { + log.debug(`[ROUTE] fulfill(508) failed for ${request.url()}: ${e2?.message || e2}`); + await route.continue(); + } + } else { + 
log.debug(`[ROUTE] fetch failed for ${request.url()}: ${msg}`); + await route.continue(); + } + } + }); + + const page = await context.newPage(); + + const seenDomains = new Set(); + const redirectLog = []; + const visitedUrls = new Set(); + const inflightRef = { value: 0 }; + const lastChangeRef = { value: Date.now() }; + + if (DEBUG_ENABLED) { + page.on('console', msg => log.debug(`[PAGE.CONSOLE] ${msg.type()}: ${msg.text()}`)); + page.on('pageerror', err => log.debug(`[PAGE.ERROR] ${err?.message}`)); + page.on('requestfailed', req => log.debug(`[REQ.FAIL] ${req.url()} reason=${req.failure()?.errorText}`)); + } + + page.on('download', async dl => { + try { await dl.failure().catch(() => {}); } catch {} + log.debug(`[SCAN] Download ignored: ${dl.url()}`); + }); + + const onReq = req => { + inflightRef.value++; + lastChangeRef.value = Date.now(); + const d = extractDomain(req.url()); + if (d) seenDomains.add(d); + log.debug(`[REQ] ${req.method()} ${req.url()}`); + }; + const onResp = resp => { + inflightRef.value = Math.max(0, inflightRef.value - 1); + lastChangeRef.value = Date.now(); + const d = extractDomain(resp.url()); + if (d) seenDomains.add(d); + const status = resp.status(); + log.debug(`[RESP] ${status} ${resp.url()}`); + if (status >= 300 && status < 400 && resp.request().resourceType() === 'document') { + const piece = buildRedirectChainForResponse(resp, MAX_REDIRECT_STEPS + 5); + redirectLog.push(...piece); + } + }; + page.on('request', onReq); + page.on('response', onResp); + + try { + log.info(`[SCAN] goto(${startUrl}) domcontentloaded timeout=${NAV_TIMEOUT_MS}`); + let response; + try { + response = await page.goto(startUrl, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT_MS }); + } catch (e) { + const msg = String(e?.message || ''); + if (/Download is starting/i.test(msg)) { + log.info(`[SCAN] goto triggered download; continue as non-HTML`); + } else { + throw e; + } + } + + if (response && response.status && response.status() === 508) { + 
throw new Error(`Too many redirects (${MAX_REDIRECT_STEPS})`); + } + + await quietWindowWait({ inflightRef, lastChangeRef, timeoutMs: NAV_TIMEOUT_MS, quietMs: QUIET_WINDOW_MS }); + const finalUrl = page.url(); + + if (visitedUrls.has(finalUrl)) throw new Error('Redirect loop detected'); + visitedUrls.add(finalUrl); + + const steps = redirectLog.length; + if (steps > MAX_REDIRECT_STEPS) throw new Error(`Too many redirects (${steps})`); + + await context.close(); + + const relatedDomains = Array.from(seenDomains) + .filter(d => !d.includes('doubleclick') && !d.includes('google')) + .sort(); + + log.info(`[SCAN] Done finalUrl=${finalUrl} domains=${relatedDomains.length} redirects=${steps}`); + return { finalUrl, relatedDomains, redirectChain: redirectLog }; + } catch (e) { + try { await context.close(); } catch {} + try { + if (browser && typeof browser.isConnected === 'function' && !browser.isConnected()) { + await browser.close(); browser = null; + } + } catch {} + log.error(`[SCAN] Error: ${e?.message}`); + throw e; + } finally { + page.off('request', onReq); + page.off('response', onResp); } } +// ---------- High-level scan with precheck and escalation ---------- +async function scanDomainOnce(originDomain) { + const startUrl = `https://${originDomain}`; + log.info(`[SCAN] Start domain="${originDomain}" url=${startUrl}`); + const pre = await precheckFollowManually(startUrl); + + if (pre.skip && (pre.reason === 'attachment' || (pre.reason || '').startsWith('non-HTML'))) { + log.info(`[SCAN] Skip non-HTML/attachment: ${pre.reason}`); + return { finalUrl: pre.finalUrl || startUrl, relatedDomains: [originDomain], redirectChain: [], precheck: pre.reason }; + } + + let targetUrl = startUrl; + + if (pre.skip && /^marketing-redirect/.test(pre.reason || '') && pre.finalUrl) { + log.info(`[SCAN] Marketing redirect -> follow target in browser: ${pre.finalUrl}`); + targetUrl = pre.finalUrl; + } else if (pre.skip && pre.tryBrowser) { + log.info(`[SCAN] Escalation to browser 
due to ${pre.reason}`); + } + + const contextOpts = { + userAgent: + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + locale: 'en-US', + timezoneId: 'UTC', + }; + + try { + const result = await scanWithBrowser(originDomain, targetUrl, contextOpts); + if (!result.relatedDomains.includes(originDomain)) { + result.relatedDomains.unshift(originDomain); + } + return result; + } catch (e) { + log.warn(`[SCAN] Browser escalation failed: ${e?.message}`); + return { finalUrl: targetUrl, relatedDomains: [originDomain], redirectChain: [], precheck: pre.reason || 'blocked' }; + } +} + +// ---------- Cache helpers ---------- +function getFromCache(domain) { + const row = stmtSelect.get(domain); + if (!row) return null; + const now = Math.floor(Date.now() / 1000); + if (row.ttl_at > now) { + try { + const out = { + relatedDomains: JSON.parse(row.result_json), + finalUrl: row.final_url || null, + redirectChain: row.redirect_chain_json ? 
JSON.parse(row.redirect_chain_json) : [], + cached: true, + cachedAt: row.updated_at, + ttlAt: row.ttl_at, + }; + return out; + } catch (e) { + log.warn(`[CACHE] Parse error: ${e?.message}`); + return null; + } + } + return null; +} +function putToCache(domain, result) { + const now = Math.floor(Date.now() / 1000); + const ttlAt = now + CACHE_TTL_SECONDS; + try { + stmtUpsert.run({ + domain, + result_json: JSON.stringify(result.relatedDomains || []), + final_url: result.finalUrl || null, + redirect_chain_json: JSON.stringify(result.redirectChain || []), + updated_at: now, + ttl_at: ttlAt, + }); + log.info(`[CACHE] Upsert ${domain} ttlAt=${ttlAt}`); + } catch (e) { + log.warn(`[CACHE] Upsert error: ${e?.message}`); + } +} + +// ---------- Routes ---------- app.get('/domains', async (req, res) => { - const { domain } = req.query; + res.type('application/json'); + const raw = req.query.domain; + log.info(`[HTTP] /domains?domain=${raw}`); + const domain = normalizeDomain(raw); if (!domain) { - res.status(400).json({ error: '"domain" query parameter is required' }); + res.status(400).json({ error: '"domain" must be a valid hostname', code: 'BAD_DOMAIN' }); return; } - const url = `https://${domain}`; - const seenDomains = new Set(); + const HARD_TIMEOUT = parseInt(process.env.HARD_TIMEOUT_MS || '70000', 10); + const hardTimer = setTimeout(() => { + try { if (!res.headersSent) res.status(504).json({ error: 'Gateway Timeout', code: 'TIMEOUT' }); } catch {} + }, HARD_TIMEOUT); try { - const browser = await chromium.launch({ - args: ['--no-sandbox', '--disable-setuid-sandbox'] + const cached = getFromCache(domain); + if (cached) { + log.info(`[HTTP] Cache HIT ${domain}`); + res.status(200).json({ + domain, + finalUrl: cached.finalUrl, + relatedDomains: cached.relatedDomains, + redirectChain: cached.redirectChain, + cached: true, + cachedAt: cached.cachedAt, + ttlAt: cached.ttlAt, + status: 'ok' + }); + return; + } + + const result = await scanDomainOnce(domain); + + if 
(result.precheck) { + if ((result.precheck || '').startsWith('marketing-redirect')) { + res.status(200).json({ + domain, + finalUrl: result.finalUrl || `https://${domain}`, + relatedDomains: [domain], + redirectChain: [], + cached: false, + status: 'ok', + note: result.precheck + }); + return; + } + res.status(200).json({ + domain, + finalUrl: result.finalUrl || `https://${domain}`, + relatedDomains: [domain], + redirectChain: [], + cached: false, + status: (result.precheck === 'forbidden' || result.precheck === 'blocked') ? 'blocked' : 'skipped', + reason: result.precheck + }); + return; + } + + putToCache(domain, result); + res.status(200).json({ + domain, + finalUrl: result.finalUrl, + relatedDomains: result.relatedDomains, + redirectChain: result.redirectChain, + cached: false, + status: 'ok' }); - const context = await browser.newContext(); - const page = await context.newPage(); - - page.on('request', request => { - const d = extractDomain(request.url()); - if (d) seenDomains.add(d); - }); - - await page.goto(url, { waitUntil: 'load', timeout: 30000 }); - await browser.close(); - - // Фильтрация доменов - const filteredDomains = Array.from(seenDomains).filter(d => - !d.includes('doubleclick') && !d.includes('google') - ).sort(); - - res.json({ domains: filteredDomains }); } catch (e) { - res.status(500).json({ error: e.message || 'Internal server error' }); + const msg = String(e?.message || 'Internal error'); + log.error(`[HTTP] Error for ${domain}: ${msg}`); + const forbidden = /403|forbidden|blocked/i.test(msg); + res.status(forbidden ? 403 : 500).json({ + error: forbidden ? 'Forbidden' : 'Internal server error', + code: forbidden ? 
'FORBIDDEN' : 'INTERNAL', + details: msg + }); + } finally { + clearTimeout(hardTimer); } }); -app.listen(port, () => { - console.log(`Domain scanner service listening on port ${port}`); +app.get('/health', (_req, res) => { + res.type('application/json'); + res.json({ ok: true }); +}); + +// ---------- Signals ---------- +process.on('SIGTERM', async () => { + log.info('[SIGNAL] SIGTERM'); + try { if (browser) await browser.close(); } catch {} + process.exit(0); +}); +process.on('SIGINT', async () => { + log.info('[SIGNAL] SIGINT'); + try { if (browser) await browser.close(); } catch {} + process.exit(0); +}); + +// ---------- Start ---------- +app.listen(PORT, () => { + log.info(`Domain scanner service listening on port ${PORT}`); });