﻿/* seed-revelys.js */
import "dotenv/config";

import fs from "node:fs";
import path from "node:path";
import crypto from "node:crypto";
import readline from "node:readline";
import { Readable } from "node:stream";
import { fileURLToPath } from "node:url";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Unwraps a dynamically imported module.
 *
 * @param {*} mod - Value obtained from `import()`.
 * @returns {*} `mod.default` when it is neither null nor undefined, otherwise `mod` itself.
 */
function interopDefault(mod) {
  const preferred = mod?.default;
  if (preferred === undefined || preferred === null) return mod;
  return preferred;
}

/**
 * Looks up a named export on a dynamically imported module.
 *
 * @param {*} mod - Value obtained from `import()`.
 * @param {string} name - Export name to look up.
 * @returns {*} `mod[name]` when it is neither null nor undefined, otherwise
 *   `mod.default[name]` (undefined when neither exists).
 */
function interopNamed(mod, name) {
  const direct = mod?.[name];
  if (direct !== undefined && direct !== null) return direct;
  return mod?.default?.[name];
}

// Third-party dependencies are loaded with dynamic import() and unwrapped through the
// interop helpers, so both `mod.default` and plain-namespace module shapes are accepted.
const slugify = interopDefault(await import("slugify"));
const pLimit = interopDefault(await import("p-limit"));
const cheerio = interopDefault(await import("cheerio"));

const ddgMod = await import("duck-duck-scrape");
const ddgSearchLib = interopNamed(ddgMod, "search");
// COMMON_HEADERS comes from a file inside the duck-duck-scrape package; it stays null
// when that file cannot be imported or does not expose the export.
let DDG_COMMON_HEADERS = null;
try {
  const ddgUtil = await import("duck-duck-scrape/lib/util.js");
  DDG_COMMON_HEADERS = interopNamed(ddgUtil, "COMMON_HEADERS") || null;
} catch {
  // Deliberately ignored (best-effort): the headers are optional.
}

const probe = interopDefault(await import("probe-image-size"));
const sharp = interopDefault(await import("sharp"));

const supabaseMod = await import("@supabase/supabase-js");
const createClient = interopNamed(supabaseMod, "createClient");

// Fail fast at module load when a required entry point could not be resolved.
if (typeof ddgSearchLib !== "function") {
  throw new Error("duck-duck-scrape: search() not found. Check dependency version.");
}

if (typeof createClient !== "function") {
  throw new Error("@supabase/supabase-js: createClient() not found. Check dependency version.");
}

// -------------------- CONFIG --------------------
// Every setting below is read from process.env once, at module load.
// Conventions used in this section:
//   - boolean flags are true only when the variable is exactly "1"; the `|| "x"` part is the
//     default applied when the variable is unset or empty;
//   - numeric settings go through Number(); unless wrapped in Math.max/Math.min they are not
//     validated here, so a non-numeric value yields NaN.
const MARKET = process.env.MARKET || "BE-WAL";

const TARGET_PER_PAIR = Number(process.env.TARGET_PER_PAIR || 10);
const MAX_SITES_TO_TRY = Number(process.env.MAX_SITES_TO_TRY || 40);
const MAX_PAGES_PER_SITE = Number(process.env.MAX_PAGES_PER_SITE || 35);
const MAX_CORPUS_CHARS = Number(process.env.MAX_CORPUS_CHARS || 24_000);

const UPLOAD_IMAGES = String(process.env.UPLOAD_IMAGES || "0") === "1";
const VERIFY_BCE_WITH_KBO = String(process.env.VERIFY_BCE_WITH_KBO || "0") === "1";
const DRY_RUN = String(process.env.DRY_RUN || "0") === "1";
const MIN_GALLERY_IMAGES = Number(process.env.MIN_GALLERY_IMAGES || 4);
const RECOVER_EMAIL = String(process.env.RECOVER_EMAIL || "1") === "1";
const RECOVER_IMAGES = String(process.env.RECOVER_IMAGES || "0") === "1";
const FB_APP_ID = process.env.FB_APP_ID || "";
const FB_APP_SECRET = process.env.FB_APP_SECRET || "";
const RECOVER_FOUNDER = String(process.env.RECOVER_FOUNDER || "1") === "1";
const RECOVER_OPENING_HOURS = String(process.env.RECOVER_OPENING_HOURS || "1") === "1";
const RECOVER_FOUNDER_EXTERNAL = String(process.env.RECOVER_FOUNDER_EXTERNAL || "0") === "1";
const TAGS_LOWERCASE = String(process.env.TAGS_LOWERCASE || "1") === "1";
const STRICT_LLM_DESCRIPTION = String(process.env.STRICT_LLM_DESCRIPTION || "1") === "1";
const STRICT_PUBLISH_QUALITY_GATE = String(process.env.STRICT_PUBLISH_QUALITY_GATE || "1") === "1";
// Clamped: at least 1 attempt, and a non-negative delay.
const LLM_MAX_ATTEMPTS = Math.max(1, Number(process.env.LLM_MAX_ATTEMPTS || 5));
const LLM_RETRY_DELAY_MS = Math.max(0, Number(process.env.LLM_RETRY_DELAY_MS || 1200));
// Hard-coded (not configurable through the environment).
const MAX_GALLERY_TO_STORE = 4;
const USED_DOMAINS_SCOPE = (process.env.USED_DOMAINS_SCOPE || "global").toLowerCase();
const STRICT_CITY_MATCH = String(process.env.STRICT_CITY_MATCH || "1") === "1";
const LOCATION_MATCH_MODE = (process.env.LOCATION_MATCH_MODE || "city").toLowerCase(); // off | city | province | region
const RESUME = String(process.env.RESUME || "1") === "1";
const OUT_DIR_NAME = process.env.OUT_DIR || "out";
const PRIORITY_FROM_LOG = String(process.env.PRIORITY_FROM_LOG || "1") === "1";
// PRIORITY_LOG_PATH may hold several paths separated by ";" or ","; each is trimmed, resolved to
// an absolute path and de-duplicated. When none is given, the default is out/run_log.ndjson next
// to this script.
// NOTE(review): the default uses the literal "out" rather than OUT_DIR_NAME — confirm this is intended.
const PRIORITY_LOG_PATHS = (() => {
  const raw = String(process.env.PRIORITY_LOG_PATH || "");
  const parsed = raw
    .split(/[;,]/)
    .map((p) => p.trim())
    .filter(Boolean)
    .map((p) => path.resolve(p));
  if (parsed.length) return Array.from(new Set(parsed));
  return [path.join(__dirname, "out", "run_log.ndjson")];
})();
const USE_LOCAL_BCE_DATA = String(process.env.USE_LOCAL_BCE_DATA || "1") === "1";
const STRICT_BCE_LOCAL_MATCH = String(process.env.STRICT_BCE_LOCAL_MATCH || "1") === "1";
const BCE_LOAD_CONTACT = String(process.env.BCE_LOAD_CONTACT || "1") === "1";
const BCE_LANGUAGE = (process.env.BCE_LANGUAGE || "FR").toUpperCase();
// Defaults to the "bce" directory next to this script.
const BCE_DIR = process.env.BCE_DIR ? path.resolve(process.env.BCE_DIR) : path.join(__dirname, "bce");

// 0 (the default) means "no limit": the slicing further down only applies to values > 0.
const CITIES_LIMIT = Number(process.env.CITIES_LIMIT || 0);
const INDUSTRIES_LIMIT = Number(process.env.INDUSTRIES_LIMIT || 0);

const FETCH_TIMEOUT_MS = Number(process.env.FETCH_TIMEOUT_MS || 12_000);
const OLLAMA_TIMEOUT_MS = Number(process.env.OLLAMA_TIMEOUT_MS || 120_000);
const GEOCODE_LOCATION = String(process.env.GEOCODE_LOCATION || "1") === "1";
// Floors: the geocoding delay never drops below 350 ms, the timeout never below 3000 ms.
const GEOCODE_MIN_DELAY_MS = Math.max(350, Number(process.env.GEOCODE_MIN_DELAY_MS || 1100));
const GEOCODE_TIMEOUT_MS = Math.max(3000, Number(process.env.GEOCODE_TIMEOUT_MS || 12000));

const LOG_LEVEL = (process.env.LOG_LEVEL || "info").toLowerCase(); // silent | error | warn | info | debug
// Numeric rank of LOG_LEVEL; an unrecognised level is treated as "info" (3).
const LOG_LEVEL_RANK = (() => {
  const map = { silent: 0, error: 1, warn: 2, info: 3, debug: 4 };
  return map[LOG_LEVEL] ?? 3;
})();

// Local LLM (Ollama) and vision-model settings, read from the environment at module load.
const OLLAMA_URL = process.env.OLLAMA_URL || "http://127.0.0.1:11434";
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || "mistral-small3.2:24b-instruct-q5_k_m";
// Context size is floored at 2048.
const OLLAMA_NUM_CTX = Math.max(2048, Number(process.env.OLLAMA_NUM_CTX || 8192));
// Enabled only when the variable is exactly "1" (off by default).
const VISION_ENABLED = process.env.VISION_ENABLED === "1";
const VISION_MODEL = process.env.VISION_MODEL || "qwen2-vl:7b";
const VISION_TOP_N = Math.max(5, Number(process.env.VISION_TOP_N || 15));
// Falls back to 0.15 when the variable is unset, empty or not a finite number.
const OLLAMA_TEMPERATURE = Number.isFinite(Number(process.env.OLLAMA_TEMPERATURE || 0.15))
  ? Number(process.env.OLLAMA_TEMPERATURE || 0.15)
  : 0.15;
// Comma-separated temperatures, one per retry attempt. Entries that are not finite or fall
// outside [0, 1.2] are dropped; if nothing valid remains, the list is [OLLAMA_TEMPERATURE].
const LLM_RETRY_TEMPERATURES = (() => {
  const raw = String(process.env.LLM_RETRY_TEMPERATURES || "0.15,0.25,0.35,0.50,0.60");
  const parsed = raw
    .split(",")
    .map((x) => Number(String(x).trim()))
    .filter((x) => Number.isFinite(x) && x >= 0 && x <= 1.2);
  return parsed.length ? parsed : [OLLAMA_TEMPERATURE];
})();
// Floored at 80 characters.
const MIN_ONE_LINER_LENGTH = Math.max(80, Number(process.env.MIN_ONE_LINER_LENGTH || 120));

// Supabase connection settings; both the URL and the key default to an empty string.
const SUPABASE_URL = process.env.SUPABASE_URL || "";
const SUPABASE_SERVICE_ROLE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY || "";
const SUPABASE_BUCKET = process.env.SUPABASE_BUCKET || "company-assets";
// Lower-cased hostname of SUPABASE_URL, or null when the URL is empty or cannot be parsed.
const SUPABASE_HOST = (() => {
  try {
    return SUPABASE_URL ? new URL(SUPABASE_URL).hostname.toLowerCase() : null;
  } catch {
    return null;
  }
})();

// Two User-Agent strings: one identifying the bot, one mimicking a desktop Chrome browser.
const USER_AGENT = "Mozilla/5.0 (compatible; RevelysSeedBot/1.3.2; +contact@revelys.be)";
const BROWSER_USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";

// SEO pack (generated via the local LLM)
const GENERATE_SEO = String(process.env.GENERATE_SEO || "1") === "1";
const SEO_VERSION = Number(process.env.SEO_VERSION || 5);
// Trailing slashes are stripped from the base URL.
const PUBLIC_COMPANY_URL_BASE = (process.env.PUBLIC_COMPANY_URL_BASE || "https://revelys.be/entreprises").replace(/\/+$/, "");
const APPEND_PUBLIC_REFRESH_SQL = String(process.env.APPEND_PUBLIC_REFRESH_SQL || "1") === "1";

// Search tuning
// SEARCH_PROVIDER is lower-cased; the comments below mention "ddg" (default), "serper" and "brave".
const SEARCH_PROVIDER = (process.env.SEARCH_PROVIDER || "ddg").toLowerCase();
// At least 1.
const SEARCH_TERMS_MAX = Math.max(1, Number(process.env.SEARCH_TERMS_MAX || 4));

// Serper (Google) API tuning (used when SEARCH_PROVIDER=serper)
// The API key is read from SERPER_API_KEY, then SERPER_KEY.
const SERPER_API_KEY = process.env.SERPER_API_KEY || process.env.SERPER_KEY || "";
const SERPER_ENDPOINT = process.env.SERPER_ENDPOINT || "https://google.serper.dev/search";
const SERPER_GL = (process.env.SERPER_GL || "be").toLowerCase(); // geo (ex: be, fr, us)
const SERPER_HL = (process.env.SERPER_HL || "fr").toLowerCase(); // language (ex: fr, en)
// Result count is clamped to the range 1..20.
const SERPER_COUNT = Math.max(1, Math.min(20, Number(process.env.SERPER_COUNT || 10)));
const SERPER_MIN_DELAY_MS = Number(process.env.SERPER_MIN_DELAY_MS || 250);
const SERPER_MAX_RETRIES = Number(process.env.SERPER_MAX_RETRIES || 5);
const SERPER_BACKOFF_BASE_MS = Number(process.env.SERPER_BACKOFF_BASE_MS || 1500);
const SERPER_BACKOFF_MAX_MS = Number(process.env.SERPER_BACKOFF_MAX_MS || 30000);
const SERPER_TIMEOUT_MS = Number(process.env.SERPER_TIMEOUT_MS || 25000);

// Brave Search API tuning (used when SEARCH_PROVIDER=brave)
// The API key is read from BRAVE_API_KEY, then BRAVE_SEARCH_API_KEY.
const BRAVE_API_KEY = process.env.BRAVE_API_KEY || process.env.BRAVE_SEARCH_API_KEY || "";
const BRAVE_COUNTRY = (process.env.BRAVE_COUNTRY || "BE").toUpperCase();
const BRAVE_SEARCH_LANG = (process.env.BRAVE_SEARCH_LANG || "fr").toLowerCase();
const BRAVE_SAFESEARCH = (process.env.BRAVE_SAFESEARCH || "moderate").toLowerCase(); // off | moderate | strict
// Result count is clamped to the range 1..20.
const BRAVE_COUNT = Math.max(1, Math.min(20, Number(process.env.BRAVE_COUNT || 10)));
const BRAVE_MIN_DELAY_MS = Number(process.env.BRAVE_MIN_DELAY_MS || 250);
const BRAVE_MAX_RETRIES = Number(process.env.BRAVE_MAX_RETRIES || 5);
const BRAVE_BACKOFF_BASE_MS = Number(process.env.BRAVE_BACKOFF_BASE_MS || 1500);
const BRAVE_BACKOFF_MAX_MS = Number(process.env.BRAVE_BACKOFF_MAX_MS || 30000);
const BRAVE_TIMEOUT_MS = Number(process.env.BRAVE_TIMEOUT_MS || 25000);

// DuckDuckGo (duck-duck-scrape) tuning.
// Note: duck-duck-scrape's `search()` does NOT apply COMMON_HEADERS on the `d.js` call unless we pass needleOptions.
const DDG_LOCALE = process.env.DDG_LOCALE || "be-fr";
const DDG_SAFESEARCH = Number(process.env.DDG_SAFESEARCH || -2); // -2=OFF, -1=MODERATE, 0=STRICT (duck-duck-scrape enum)
const DDG_MIN_DELAY_MS = Number(process.env.DDG_MIN_DELAY_MS || 1500);
const DDG_MAX_RETRIES = Number(process.env.DDG_MAX_RETRIES || 6);
const DDG_BACKOFF_BASE_MS = Number(process.env.DDG_BACKOFF_BASE_MS || 2000);
const DDG_BACKOFF_MAX_MS = Number(process.env.DDG_BACKOFF_MAX_MS || 60000);
const DDG_REQUEST_TIMEOUT_MS = Number(process.env.DDG_REQUEST_TIMEOUT_MS || 25000);
// Request options handed to duck-duck-scrape: the library's common headers (when they could be
// loaded) overridden by our own user-agent / accept-language, plus one timeout value used for
// both the open and the response phase.
const DDG_NEEDLE_OPTIONS = {
  headers: {
    ...(DDG_COMMON_HEADERS || {}),
    // Keep this browser-like: DDG blocks "unknown" clients quickly.
    "user-agent":
      process.env.DDG_USER_AGENT ||
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    "accept-language": process.env.DDG_ACCEPT_LANGUAGE || "fr-BE,fr;q=0.9,en;q=0.8",
  },
  open_timeout: DDG_REQUEST_TIMEOUT_MS,
  response_timeout: DDG_REQUEST_TIMEOUT_MS,
};

// ---- Load JSON configs (cities / industries / blacklist) ----
/**
 * Reads and parses a JSON file, returning `fallback` when that is not possible.
 *
 * A missing file (ENOENT) is treated as a normal "not configured" case and stays silent.
 * Any other read error, and any JSON syntax error, still returns `fallback` but is reported
 * on stderr so a broken config file does not silently turn into an empty configuration.
 * A leading UTF-8 byte-order mark is removed before parsing, because JSON.parse rejects it.
 *
 * @param {string} filePath - Path of the JSON file.
 * @param {*} fallback - Value returned when the file cannot be read or parsed.
 * @returns {*} The parsed JSON value, or `fallback`.
 */
function loadJson(filePath, fallback) {
  let text;
  try {
    text = fs.readFileSync(filePath, "utf-8");
  } catch (err) {
    if (err?.code !== "ENOENT") {
      console.warn(`[config] cannot read ${filePath}: ${err?.message || err} (using fallback)`);
    }
    return fallback;
  }

  try {
    // Editors on Windows often save JSON with a BOM; strip it so the file still parses.
    return JSON.parse(text.replace(/^\uFEFF/, ""));
  } catch (err) {
    console.warn(`[config] invalid JSON in ${filePath}: ${err?.message || err} (using fallback)`);
    return fallback;
  }
}

// Data files are read from the "config" directory next to this script; each one falls back to an
// empty list/object when loadJson cannot provide it.
let CITIES = loadJson(path.join(__dirname, "config", "cities.json"), []);
let INDUSTRIES = loadJson(path.join(__dirname, "config", "industries.json"), []);
const BLACKLIST_DOMAINS = loadJson(path.join(__dirname, "config", "blacklist_domains.json"), []);
const CITY_PROVINCES = loadJson(path.join(__dirname, "config", "city_provinces.json"), {});
// Set of normalized word tokens (3+ characters) taken from every entry of CITIES and every key
// of CITY_PROVINCES. It is built before the CITIES_LIMIT slicing below, so it covers all
// configured cities, not only the limited subset.
const CITY_HINT_TOKENS = (() => {
  const set = new Set();
  const addCityTokens = (raw) => {
    const norm = normalizeCityName(raw);
    if (!norm) return;
    for (const token of norm.split(" ").filter(Boolean)) {
      if (token.length >= 3) set.add(token);
    }
  };
  for (const city of CITIES || []) addCityTokens(city);
  for (const city of Object.keys(CITY_PROVINCES || {})) addCityTokens(city);
  return set;
})();
// Host-name fragments of directories, review sites, job boards and marketplaces.
// NOTE(review): how these are matched (substring vs. exact) is decided by code outside this
// section — confirm there.
const NON_OFFICIAL_HOST_HINTS = [
  "pagesdor",
  "pagesjaunes",
  "infobel",
  "kompass",
  "trustup",
  "trustpilot",
  "ringtwice",
  "trouvetonresto",
  "tripadvisor",
  "starofservice",
  "allovoisins",
  "houzz",
  "yelp",
  "sortlist",
  "booking.com",
  "indeed",
  "stepstone",
  "jobijoba",
  "jooble",
  "optioncarriere",
  "vdab",
  "yoojo",
  "yoopies",
  "2ememain",
  "care.com",
  "tafsquare",
];
// URL path segments (case-insensitive) such as /annuaire/, /directory/ or /category/.
const NON_OFFICIAL_PATH_RE =
  /\/(?:annuaire|directory|listing|platform|plateforme|comparateur|guide|categorie|categories|category|entreprises?|professionnels?|prestataires?)(?:\/|$)/i;
// Directory / marketplace / ranking vocabulary in free text.
const NON_OFFICIAL_TEXT_RE =
  /\b(?:annuaire|platforme|plateforme|platform|marketplace|comparateur|guide des|classement|top\s*\d+|devis en ligne)\b/i;
// Plural-friendly words for companies / professionals / providers.
const NON_OFFICIAL_CONTEXT_RE = /\b(?:entreprises?|professionnels?|prestataires?|restaurants?|traiteurs?|artisans?)\b/i;
// Hosts equal to, or sub-domains of, a handful of Belgian public-sector domains.
const NON_OFFICIAL_PUBLIC_HOST_RE = /(?:^|\.)(?:liege\.be|bruxelles\.be|wallonie\.be|belgium\.be|fgov\.be|gov\.be)$/i;
// Municipal-administration vocabulary, split into a "strong" and a "weak" pattern. The patterns
// are written without accents, so they presumably run on accent-stripped text — TODO confirm.
const NON_OFFICIAL_PUBLIC_STRONG_TEXT_RE =
  /\b(?:administration communale|service population|service urbanisme|hotel de ville|maison communale|vie communale|services communaux|etat civil|guichet communal)\b/i;
const NON_OFFICIAL_PUBLIC_WEAK_TEXT_RE = /\b(?:ville de|commune de|service public|cpas)\b/i;
// Trade / commercial vocabulary (quotes, repairs, plumbing, roofing, ...).
const NON_OFFICIAL_COMMERCIAL_TEXT_RE =
  /\b(?:devis|tarif|intervention|urgence|depannage|plombier|chauffagiste|electricien|serrurier|vitrier|toiture|facade|isolation|entretien|installation|reparation)\b/i;

// Optional truncation of the work lists; only a finite limit greater than 0 has an effect.
if (Number.isFinite(CITIES_LIMIT) && CITIES_LIMIT > 0) CITIES = CITIES.slice(0, CITIES_LIMIT);
if (Number.isFinite(INDUSTRIES_LIMIT) && INDUSTRIES_LIMIT > 0) INDUSTRIES = INDUSTRIES.slice(0, INDUSTRIES_LIMIT);

// -------------------- UTILS --------------------
/**
 * Picks the sampling temperature for a given (1-based) LLM attempt.
 *
 * The attempt number is mapped onto LLM_RETRY_TEMPERATURES and clamped to the list bounds, so
 * attempts past the end reuse the last entry. A missing or falsy attempt counts as attempt 1.
 *
 * @param {number} attempt - 1-based attempt number.
 * @returns {number} The selected temperature, or OLLAMA_TEMPERATURE when the selected entry is
 *   not a finite number.
 */
function llmTemperatureForAttempt(attempt) {
  const lastIndex = LLM_RETRY_TEMPERATURES.length - 1;
  const wantedIndex = Number(attempt || 1) - 1;
  const index = Math.max(0, Math.min(lastIndex, wantedIndex));
  const candidate = LLM_RETRY_TEMPERATURES[index];
  if (Number.isFinite(candidate)) return candidate;
  return OLLAMA_TEMPERATURE;
}

/**
 * Creates directory `p` (including missing parents) unless something already exists at that path.
 *
 * @param {string} p - Directory path.
 */
function ensureDir(p) {
  const alreadyPresent = fs.existsSync(p);
  if (alreadyPresent) return;
  fs.mkdirSync(p, { recursive: true });
}

/**
 * Tells whether something exists at path `p`.
 *
 * @param {string} p - Path to check.
 * @returns {boolean} Result of fs.existsSync, or false if that call throws.
 */
function fileExists(p) {
  let present;
  try {
    present = fs.existsSync(p);
  } catch {
    present = false;
  }
  return present;
}

/**
 * SHA-256 of a value's string form, as lowercase hex.
 *
 * @param {*} str - Value to hash; falsy values are hashed as the empty string.
 * @returns {string} 64-character hex digest.
 */
function sha256Hex(str) {
  const hasher = crypto.createHash("sha256");
  hasher.update(String(str || ""), "utf8");
  return hasher.digest("hex");
}

/**
 * SHA-256 of binary data, as lowercase hex.
 *
 * @param {Buffer|Uint8Array|string} buf - Data to hash.
 * @returns {string|null} Hex digest, or null when `buf` is falsy.
 */
function sha256Buffer(buf) {
  if (buf) {
    const hasher = crypto.createHash("sha256");
    hasher.update(buf);
    return hasher.digest("hex");
  }
  return null;
}

/**
 * Average-hash of an image: the image is resized to 16x16 ("fill"), converted to grayscale and
 * read as raw samples; each sample then becomes "1" when it is >= the mean sample value and
 * "0" otherwise.
 *
 * @param {Buffer} buf - Encoded image data handed to sharp.
 * @returns {Promise<string|null>} String of "0"/"1" characters, or null when processing fails.
 */
async function computePerceptualHash(buf) {
  try {
    const raw = await sharp(buf)
      .resize(16, 16, { fit: "fill" })
      .grayscale()
      .raw()
      .toBuffer({ resolveWithObject: true });
    const samples = Array.from(raw.data);

    let total = 0;
    for (const value of samples) total += value;
    const mean = total / samples.length;

    let bits = "";
    for (const value of samples) bits += value >= mean ? "1" : "0";
    return bits;
  } catch {
    return null;
  }
}

/**
 * Hamming distance between two equal-length hash strings.
 *
 * @param {string} h1 - First hash.
 * @param {string} h2 - Second hash.
 * @returns {number} Number of differing positions, or Infinity when either hash is
 *   missing/empty or the lengths differ.
 */
function perceptualHashDistance(h1, h2) {
  const comparable = Boolean(h1) && Boolean(h2) && h1.length === h2.length;
  if (!comparable) return Infinity;

  let differing = 0;
  let pos = 0;
  while (pos < h1.length) {
    if (h1[pos] !== h2[pos]) differing += 1;
    pos += 1;
  }
  return differing;
}

// Max Hamming distance (out of 256 bits) at which two images are considered visually identical.
// Overridable through the PHASH_DISTANCE_THRESHOLD environment variable; the default of 10 is
// about 4% of the bits, intended to catch the same image at slightly different
// compression/resize.
const PHASH_DISTANCE_THRESHOLD = Number(process.env.PHASH_DISTANCE_THRESHOLD || 10);

/**
 * Rebuilds resume state from an NDJSON run log.
 *
 * Only lines that parse as JSON and carry `status === "ok"` are used; blank and unparsable lines
 * are skipped. For each such record the supplied collections are updated in place:
 *   - usedDomainsGlobal gets the record's domain (`domain` field, else derived from `url`);
 *   - usedDomainsByPair gets that domain under the "industry||city" key;
 *   - usedBce gets the `bce` value when present;
 *   - pairCounts is incremented for the "industry||city" key.
 *
 * @param {string} logPath - Path of the NDJSON log.
 * @param {{usedBce: Set<string>, usedDomainsGlobal: Set<string>,
 *          usedDomainsByPair: Map<string, Set<string>>, pairCounts: Map<string, number>}} state
 * @returns {Promise<{okCount: number}>} Number of "ok" records seen (0 when the file is missing).
 */
async function loadResumeFromLog(logPath, { usedBce, usedDomainsGlobal, usedDomainsByPair, pairCounts }) {
  let okCount = 0;
  if (!fileExists(logPath)) return { okCount };

  const lineReader = readline.createInterface({
    input: fs.createReadStream(logPath, { encoding: "utf-8" }),
    crlfDelay: Infinity,
  });

  for await (const rawLine of lineReader) {
    const text = String(rawLine || "").trim();
    if (!text) continue;

    let record;
    try {
      record = JSON.parse(text);
    } catch {
      continue;
    }
    if (record?.status !== "ok") continue;

    okCount += 1;

    const industry = record.industry ? String(record.industry) : null;
    const city = record.city ? String(record.city) : null;
    const pairKey = industry && city ? `${industry}||${city}` : null;

    const domain = record.domain ? String(record.domain) : domainOf(record.url || "");
    if (domain) {
      usedDomainsGlobal.add(domain);
      if (pairKey) {
        let domainsForPair = usedDomainsByPair.get(pairKey);
        if (!domainsForPair) {
          domainsForPair = new Set();
          usedDomainsByPair.set(pairKey, domainsForPair);
        }
        domainsForPair.add(domain);
      }
    }

    if (record.bce) usedBce.add(String(record.bce));

    if (pairKey) pairCounts.set(pairKey, (pairCounts.get(pairKey) || 0) + 1);
  }

  return { okCount };
}

/**
 * Extracts "priority" search candidates from a previous NDJSON run log.
 *
 * Only records with `status === "ok"` are considered. A record is kept when it has an industry,
 * a city and a usable URL, is not flagged by nonOfficialSiteReason(), is not blacklisted, yields
 * a domain, and that domain was not already kept for the same "industry||city" pair. Records
 * rejected as non-official or blacklisted are counted in `skippedNonOfficial`.
 *
 * @param {string} logPath - Path of the NDJSON log.
 * @returns {Promise<{loaded: boolean, path: string, byPair: Map<string, object[]>,
 *   byNormPair: Map<string, object[]>, totalOk: number, kept: number, skippedNonOfficial: number}>}
 *   `byPair` is keyed by the raw "industry||city" pair, `byNormPair` by priorityPairKey();
 *   `loaded` is false when the path is empty or the file does not exist.
 */
async function loadPriorityCandidatesFromLog(logPath) {
  const byPair = new Map();
  const byNormPair = new Map();
  const counters = { totalOk: 0, kept: 0, skippedNonOfficial: 0 };

  if (!logPath || !fileExists(logPath)) {
    return { loaded: false, path: logPath, byPair, byNormPair, ...counters };
  }

  // Returns the container stored under `key`, creating it with `make()` when absent.
  const bucketFor = (map, key, make) => {
    let bucket = map.get(key);
    if (!bucket) {
      bucket = make();
      map.set(key, bucket);
    }
    return bucket;
  };

  const domainsSeenByPair = new Map();

  const lineReader = readline.createInterface({
    input: fs.createReadStream(logPath, { encoding: "utf-8" }),
    crlfDelay: Infinity,
  });

  for await (const rawLine of lineReader) {
    const text = String(rawLine || "").trim();
    if (!text) continue;

    let record;
    try {
      record = JSON.parse(text);
    } catch {
      continue;
    }
    if (record?.status !== "ok") continue;
    counters.totalOk += 1;

    const industry = normSpaces(record.industry || "");
    const city = normSpaces(record.city || "");
    const url = safeUrl(record.url || "");
    if (!industry || !city || !url) continue;

    const rejectionReason = nonOfficialSiteReason({
      url,
      title: record.title || "",
      snippet: record.snippet || "",
      siteName: record.name || "",
      corpus: "",
      targetCity: city,
    });
    if (rejectionReason) {
      counters.skippedNonOfficial += 1;
      continue;
    }

    // Blacklisted URLs are reported under the same counter as non-official sites.
    if (isBlacklisted(url)) {
      counters.skippedNonOfficial += 1;
      continue;
    }

    const domain = domainOf(url);
    if (!domain) continue;

    const pairKey = `${industry}||${city}`;
    const normalizedPairKey = priorityPairKey(industry, city);

    // One entry per domain and per raw pair.
    const seenDomains = bucketFor(domainsSeenByPair, pairKey, () => new Set());
    if (seenDomains.has(domain)) continue;
    seenDomains.add(domain);

    const categoryIndustry = record.industry_category
      ? normSpaces(record.industry_category)
      : canonicalIndustryLabel(industry) || cleanIndustryKeyword(industry) || industry;

    const candidate = {
      title: normSpaces(record.title || record.name || domain),
      url,
      snippet: normSpaces(record.snippet || ""),
      searchKeyword: record.search_keyword ? normSpaces(record.search_keyword) : null,
      categoryIndustry,
      sourceQuery: "priority_log",
      priority: true,
    };

    bucketFor(byPair, pairKey, () => []).push(candidate);
    if (normalizedPairKey) {
      bucketFor(byNormPair, normalizedPairKey, () => []).push(candidate);
    }
    counters.kept += 1;
  }

  return { loaded: true, path: logPath, byPair, byNormPair, ...counters };
}

/**
 * Appends every non-empty array of `sourceMap` to the array stored under the same key in
 * `targetMap` (creating that array when missing). `targetMap` is modified in place.
 * Does nothing unless both arguments are Map instances.
 *
 * @param {Map<string, Array>} targetMap - Map receiving the entries.
 * @param {Map<string, Array>} sourceMap - Map providing the entries.
 */
function mergePriorityCandidatesMap(targetMap, sourceMap) {
  const bothAreMaps = targetMap instanceof Map && sourceMap instanceof Map;
  if (!bothAreMaps) return;

  sourceMap.forEach((values, key) => {
    if (!Array.isArray(values) || values.length === 0) return;
    let bucket = targetMap.get(key);
    if (!bucket) {
      bucket = [];
      targetMap.set(key, bucket);
    }
    bucket.push(...values);
  });
}

/**
 * Builds the normalized "industry||city" key used to look up priority candidates.
 *
 * The industry part is the first non-empty value among canonicalIndustryLabel(industry),
 * cleanIndustryKeyword(industry) and the raw industry; both parts go through normalizeCityName.
 *
 * @param {string} industry - Industry label.
 * @param {string} city - City name.
 * @returns {string|null} The key, or null when either normalized part is empty.
 */
function priorityPairKey(industry, city) {
  const industryLabel = canonicalIndustryLabel(industry) || cleanIndustryKeyword(industry) || industry;
  const industryPart = normalizeCityName(industryLabel);
  const cityPart = normalizeCityName(city);
  return industryPart && cityPart ? `${industryPart}||${cityPart}` : null;
}

/**
 * Collects slugs from a SQL file into `usedSlugs`.
 *
 * A slug is the first single-quoted literal of every `SELECT '<slug>', '<text>'` occurrence.
 *
 * @param {string} sqlPath - Path of the SQL file.
 * @param {Set<string>} usedSlugs - Set receiving the slugs (modified in place).
 * @returns {number} Number of occurrences found (duplicates included); 0 when the path is empty,
 *   the file is missing or `usedSlugs` is not a Set.
 */
function loadUsedSlugsFromSql(sqlPath, usedSlugs) {
  const usable = Boolean(sqlPath) && fileExists(sqlPath) && usedSlugs instanceof Set;
  if (!usable) return 0;

  const sql = fs.readFileSync(sqlPath, "utf-8");
  let found = 0;
  for (const match of sql.matchAll(/SELECT\s+'([^']+)'\s*,\s*'[^']+'/g)) {
    const slug = match[1];
    if (!slug) continue;
    usedSlugs.add(String(slug));
    found += 1;
  }
  return found;
}

/**
 * Collects enterprise numbers from a SQL file into `usedBce`.
 *
 * Every single-quoted literal of the form 'BE' + 10 digits (prefix case-insensitive) is matched;
 * only the 10 digits are stored.
 *
 * @param {string} sqlPath - Path of the SQL file.
 * @param {Set<string>} usedBce - Set receiving the numbers (modified in place).
 * @returns {number} Number of occurrences found (duplicates included); 0 when the path is empty,
 *   the file is missing or `usedBce` is not a Set.
 */
function loadUsedBceFromSql(sqlPath, usedBce) {
  const usable = Boolean(sqlPath) && fileExists(sqlPath) && usedBce instanceof Set;
  if (!usable) return 0;

  const sql = fs.readFileSync(sqlPath, "utf-8");
  let found = 0;
  for (const match of sql.matchAll(/'BE(\d{10})'/gi)) {
    const digits = match[1];
    if (!digits) continue;
    usedBce.add(String(digits));
    found += 1;
  }
  return found;
}

/**
 * Replaces a fixed list of "UTF-8 bytes read as Latin-1" sequences with the character they were
 * meant to be. Each sequence is replaced independently, so the repair also works on strings where
 * only part of the text is mis-encoded (a whole-string re-decode would produce U+FFFD there).
 *
 * The substitutions are applied one after another, in the order listed.
 *
 * @param {string} s - Text to repair.
 * @returns {string} Text with the known sequences replaced.
 */
function repairFrenchMojibake(s) {
  const substitutions = [
    // 2-byte UTF-8 sequences starting with C3 (most accented French letters).
    [/\u00C3\u00A0/g, "\u00E0"], // à
    [/\u00C3\u00A2/g, "\u00E2"], // â
    [/\u00C3\u00A7/g, "\u00E7"], // ç
    [/\u00C3\u00A8/g, "\u00E8"], // è
    [/\u00C3\u00A9/g, "\u00E9"], // é
    [/\u00C3\u00AA/g, "\u00EA"], // ê
    [/\u00C3\u00AB/g, "\u00EB"], // ë
    [/\u00C3\u00AE/g, "\u00EE"], // î
    [/\u00C3\u00AF/g, "\u00EF"], // ï
    [/\u00C3\u00B4/g, "\u00F4"], // ô
    [/\u00C3\u00B9/g, "\u00F9"], // ù
    [/\u00C3\u00BB/g, "\u00FB"], // û
    [/\u00C3\u00BC/g, "\u00FC"], // ü
    [/\u00C3\u0080/g, "\u00C0"], // À
    [/\u00C3\u0088/g, "\u00C8"], // È
    [/\u00C3\u0089/g, "\u00C9"], // É
    [/\u00C3\u008A/g, "\u00CA"], // Ê
    [/\u00C3\u008E/g, "\u00CE"], // Î
    [/\u00C3\u0094/g, "\u00D4"], // Ô
    // 2-byte sequences starting with C2 (Latin-1 supplement punctuation).
    [/\u00C2\u00A0/g, " "], // NBSP becomes a regular space
    [/\u00C2\u00AB/g, "\u00AB"], // «
    [/\u00C2\u00BB/g, "\u00BB"], // »
    [/\u00C2\u00A9/g, "\u00A9"], // ©
    // 3-byte sequences E2 80 xx (typographic quotes, dashes, ellipsis).
    [/\u00E2\u0080\u0099/g, "\u2019"], // right single quotation mark
    [/\u00E2\u0080\u0098/g, "\u2018"], // left single quotation mark
    [/\u00E2\u0080\u009C/g, "\u201C"], // left double quotation mark
    [/\u00E2\u0080\u009D/g, "\u201D"], // right double quotation mark
    [/\u00E2\u0080\u0093/g, "\u2013"], // en dash
    [/\u00E2\u0080\u0094/g, "\u2014"], // em dash
    [/\u00E2\u0080\u00A6/g, "\u2026"], // ellipsis
  ];

  let repaired = s;
  for (const [pattern, replacement] of substitutions) {
    repaired = repaired.replace(pattern, replacement);
  }
  return repaired;
}

/**
 * Repairs text whose UTF-8 bytes were decoded as Latin-1, when it looks affected.
 *
 * Strategy:
 *   1. Text without any marker character is returned untouched.
 *   2. Targeted repair (repairFrenchMojibake). If it changed the text and lowered the marker
 *      count, it is applied a second time (for nested sequences) and returned.
 *   3. Otherwise the whole string is re-read as Latin-1 bytes decoded as UTF-8. That result is
 *      used only when it contains no U+FFFD and has fewer markers than the input; in every other
 *      case the targeted result is returned.
 *
 * @param {*} input - Text to inspect; falsy values are treated as the empty string.
 * @returns {string} Repaired text.
 */
function maybeRepairMojibakeUtf8(input) {
  const text = String(input || "");
  if (!text) return text;

  const looksSuspicious = /[\u00C3\u00C2\u00E2]/.test(text) || /[ÃÂâ€]/.test(text);
  if (!looksSuspicious) return text;

  // Number of characters that typically start a mojibake sequence.
  const markerCount = (s) => (String(s || "").match(/[\u00C3\u00C2\u00E2Ã]/g) || []).length;
  const baseline = markerCount(text);

  // Pass 1: sequence-by-sequence replacement (safe for mixed encoding).
  const patched = repairFrenchMojibake(text);
  if (patched !== text && markerCount(patched) < baseline) {
    return repairFrenchMojibake(patched);
  }

  // Pass 2: whole-string Latin-1 -> UTF-8 reinterpretation (works when the encoding is uniform).
  if (!/[ÃÂâ€]/.test(text)) return patched;

  let reinterpreted;
  try {
    reinterpreted = Buffer.from(text, "latin1").toString("utf8");
  } catch {
    return patched;
  }
  if (!reinterpreted || reinterpreted.includes("\uFFFD")) return patched;
  return markerCount(reinterpreted) < baseline ? reinterpreted : patched;
}

/**
 * Cleans a piece of text for storage/comparison: HTML entities are decoded, mojibake is
 * repaired, unsafe Unicode characters are removed, every whitespace run becomes a single space
 * and the result is trimmed.
 *
 * @param {*} s - Input; falsy values are treated as the empty string.
 * @returns {string} Cleaned single-line text.
 */
function normSpaces(s) {
  const withoutEntities = decodeBasicHtmlEntities(String(s || ""));
  const reencoded = maybeRepairMojibakeUtf8(withoutEntities);
  const safe = stripUnsafeUnicodeChars(reencoded);
  return safe.replace(/\s+/g, " ").trim();
}

/**
 * Removes characters that should not end up in stored text:
 *   - C0/C1 control characters, except TAB, LF and CR which are kept;
 *   - private-use code points (BMP and supplementary planes), often icon-font leftovers from
 *     scraped pages.
 *
 * @param {*} s - Input value.
 * @returns {string} Filtered string; "" for null/undefined.
 */
function stripUnsafeUnicodeChars(s) {
  if (s == null) return "";

  const removals = [
    /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g,
    /[\uE000-\uF8FF]/g,
    /[\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/gu,
  ];
  return removals.reduce((text, pattern) => text.replace(pattern, ""), String(s));
}

/**
 * Prints `msg` with console.log when the configured log level rank is at least 3 ("info").
 *
 * @param {*} msg - Message to print.
 */
function logInfo(msg) {
  const enabled = LOG_LEVEL_RANK >= 3;
  if (!enabled) return;
  console.log(msg);
}

/**
 * Prints `msg` with console.log when the configured log level rank is at least 4 ("debug").
 *
 * @param {*} msg - Message to print.
 */
function logDebug(msg) {
  const enabled = LOG_LEVEL_RANK >= 4;
  if (!enabled) return;
  console.log(msg);
}

// Tags considered too emotional or too generic to keep. normalizeTags() drops a tag when
// normalizeBceForTokenMatch(tag) is found in this set, which is why the entries are written in
// lowercase and without accents (presumably the form that helper produces — TODO confirm).
const TAG_EMOTIONAL_OR_GENERIC = new Set([
  "serenite",
  "confiance",
  "securite",
  "tranquillite",
  "qualite",
  "professionnel",
  "professionnelle",
  "expertise",
  "service client",
  "proximite",
  "fiabilite",
  "reactivite",
  "accompagnement",
  "innovation",
]);

// Known words that the LLM sometimes glues to a neighbouring word inside a single tag.
const TAG_JOINED_TOKENS = [
  "depannage",
  "debouchage",
  "fuite",
  "fuites",
  "chauffage",
  "chaudiere",
  "climatisation",
  "toiture",
  "serrurerie",
  "vitrerie",
  "menuiserie",
  "isolation",
  "facade",
  "carrelage",
  "peinture",
  "nettoyage",
  "jardinage",
  "ramonage",
  "cuisine",
  "panneaux",
  "solaire",
  "recharge",
  "borne",
  "maintenance",
  "urgence",
  "rapide",
  "technique",
  "durable",
];

// Longest-first copy of TAG_JOINED_TOKENS, computed once. Sorting a copy keeps the declared list
// untouched; previously the shared array was re-sorted in place on every call.
const TAG_JOINED_TOKENS_LONGEST_FIRST = [...TAG_JOINED_TOKENS].sort((a, b) => b.length - a.length);

/**
 * Inserts the missing space in a tag made of two glued words, e.g. "depannagechauffage" becomes
 * "depannage chauffage".
 *
 * Only space-free tags of at least 12 characters are considered. Known tokens are tried longest
 * first; the first token found at the start or at the end of the tag, with more than 3
 * characters left over, decides the split.
 *
 * @param {*} t - Tag to inspect.
 * @returns {string} The lower-cased split tag, or the input (as a string) when no split applies.
 */
function splitLikelyJoinedTag(t) {
  const raw = String(t || "");
  if (!raw || /\s/.test(raw) || raw.length < 12) return raw;
  const s = raw.toLowerCase();

  for (const token of TAG_JOINED_TOKENS_LONGEST_FIRST) {
    // Require a remainder of more than 3 characters so it is plausibly a word of its own.
    if (s.length <= token.length + 3) continue;
    if (s.startsWith(token)) {
      return `${token} ${s.slice(token.length)}`.trim();
    }
    if (s.endsWith(token)) {
      return `${s.slice(0, s.length - token.length)} ${token}`.trim();
    }
  }
  return raw;
}

/**
 * Cleans a single raw tag.
 *
 * Steps: whitespace/encoding cleanup, underscores turned into spaces, glued words split,
 * "24h 7j" variants rewritten as "24h/7j", the word "revelys" removed.
 *
 * @param {*} tag - Raw tag.
 * @returns {string|string[]|null} The cleaned tag; null when nothing is left; or the two-element
 *   array ["propreté", "sécurité"] for one specific glued token observed in outputs.
 */
function normalizeTag(tag) {
  const initial = normSpaces(String(tag || ""));
  if (!initial) return null;

  // Undo snake_case outputs from the LLM, then split words that were glued together.
  const spaced = normSpaces(initial.replace(/_/g, " "));
  const unglued = normSpaces(splitLikelyJoinedTag(spaced));

  const cleaned = unglued
    .replace(/\b24h\s*\/?\s*7j\b/gi, "24h/7j")
    .replace(/\brevelys\b/gi, " ")
    .trim();
  if (!cleaned) return null;

  // A single token containing both "propret…" and "sécurit…" is returned as two clean tags
  // rather than kept as one broken token.
  const isGluedCleanlinessSafety =
    !/\s/.test(cleaned) && /propret/i.test(cleaned) && /(sécurit|securit)/i.test(cleaned);
  return isGluedCleanlinessSafety ? ["propreté", "sécurité"] : cleaned;
}

/**
 * Cleans and de-duplicates a list of tags.
 *
 * Each raw tag goes through normalizeTag() (which may yield one tag, two tags or nothing). A tag
 * is then lower-cased when TAGS_LOWERCASE is on, dropped when it belongs to
 * TAG_EMOTIONAL_OR_GENERIC, and de-duplicated on its normalizeCityName() form.
 *
 * The `max` cap is enforced per emitted tag, so a raw tag that expands into several tags can no
 * longer push the result past `max`.
 *
 * @param {Array} tags - Raw tags; anything that is not an array is treated as empty.
 * @param {{max?: number}} [options] - `max`: maximum number of tags returned (default 12).
 * @returns {string[]} Cleaned tags in input order, at most `max` of them.
 */
function normalizeTags(tags, { max = 12 } = {}) {
  const out = [];
  const seen = new Set();

  const push = (v) => {
    if (!v || out.length >= max) return;
    const raw = normSpaces(String(v));
    const t = TAGS_LOWERCASE ? raw.toLowerCase() : raw;
    if (!t) return;
    if (TAG_EMOTIONAL_OR_GENERIC.has(normalizeBceForTokenMatch(t))) return;
    // Accent/punctuation-insensitive key, so "Dépannage" and "depannage" count as duplicates.
    const k = normalizeCityName(t);
    if (!k || seen.has(k)) return;
    seen.add(k);
    out.push(t);
  };

  for (const raw of Array.isArray(tags) ? tags : []) {
    if (out.length >= max) break;
    const norm = normalizeTag(raw);
    for (const piece of Array.isArray(norm) ? norm : [norm]) push(piece);
  }

  return out;
}

/**
 * Extends a tag list with service labels that are not already present.
 *
 * Up to 6 normalized services are considered; each is lower-cased when TAGS_LOWERCASE is on and
 * skipped when its normalizeCityName() form matches an existing tag. The input array is not
 * modified.
 *
 * @param {string[]} tags - Existing tags; anything that is not an array is treated as empty.
 * @param {Array|string} services - Raw services, as accepted by normalizeServicesList().
 * @param {{max?: number}} [options] - `max`: maximum length of the result (default 12).
 * @returns {string[]} New array of at most `max` tags.
 */
function enrichTagsWithServices(tags, services, { max = 12 } = {}) {
  const merged = Array.isArray(tags) ? [...tags] : [];

  const knownKeys = new Set();
  for (const existing of merged) {
    const existingKey = normalizeCityName(existing);
    if (existingKey) knownKeys.add(existingKey);
  }

  for (const service of normalizeServicesList(services, { max: 6 })) {
    const asTag = TAGS_LOWERCASE ? String(service).toLowerCase() : String(service);
    const tagKey = normalizeCityName(asTag);
    if (!tagKey || knownKeys.has(tagKey)) continue;
    knownKeys.add(tagKey);
    merged.push(asTag);
    if (merged.length >= max) break;
  }

  return merged.slice(0, max);
}

// Service labels too generic to keep on their own. normalizeServiceLabel() rejects a label when
// normalizeBceForTokenMatch(label) is found in this set.
// NOTE(review): if that helper strips accents, the accented "dépannage" entry can never match
// and only "depannage" is effective — confirm against the helper.
const GENERIC_SERVICE_LABELS = new Set([
  "service",
  "services",
  "prestation",
  "prestations",
  "intervention",
  "interventions",
  "depannage",
  "dépannage",
]);

/**
 * Upper-cases the first character of a string and leaves the rest untouched.
 *
 * @param {*} s - Input; falsy values yield "".
 * @returns {string} Capitalized string.
 */
function capitalizeFirst(s) {
  const text = String(s || "");
  return text ? `${text[0].toUpperCase()}${text.slice(1)}` : text;
}

/**
 * Turns a raw service label into a clean, capitalized French label.
 *
 * Steps: text cleanup, removal of leading list markers and trailing punctuation, removal of the
 * word "revelys", lower-casing, a few English-to-French and hand-tuned rewrites, then generic
 * grammar fixes that insert "de" after "réparation" / "installation" / "entretien" when no
 * article follows.
 *
 * Fixes compared with the previous version:
 *   - the list-marker pattern had been corrupted by an encoding round-trip: it stripped a leading
 *     "â", "€" or "¢" and never stripped a real bullet "•". It now strips "-", "*", "•" and the
 *     three-character mojibake form of the bullet;
 *   - "entretien d'…" is no longer rewritten to "entretien de d'…";
 *   - the typographic apostrophe (U+2019) is recognised in the elision checks.
 *
 * @param {*} raw - Raw label.
 * @returns {string|null} Cleaned label, or null when empty or listed in GENERIC_SERVICE_LABELS.
 */
function normalizeServiceLabel(raw) {
  let s = normSpaces(String(raw || ""));
  if (!s) return null;

  s = s
    // Leading list markers: "-", "*", "•", or "•" mis-decoded as the sequence "â€¢".
    .replace(/^(?:[\-\u2022*]|\u00E2\u20AC\u00A2)+/, "")
    .replace(/\s*\/\s*/g, " ")
    .replace(/[;:,.]+$/g, "")
    .replace(/\brevelys\b/gi, " ")
    .trim();
  if (!s) return null;

  let lower = s.toLowerCase();
  lower = lower.replace(/\bchauffe[\s-]?eau(x)?\b/g, "chauffe-eau");
  lower = lower.replace(/\bwcs?\b/g, "wc");

  // Common EN -> FR rewrites observed in crawl outputs.
  lower = lower.replace(/\bproperty maintenance\b/g, "entretien de biens immobiliers");
  lower = lower.replace(/\bmaintenance services?\b/g, "services de maintenance");
  lower = lower.replace(/\bdoor opening\b/g, "ouverture de porte");
  lower = lower.replace(/\bgarage door repair\b/g, "réparation de porte de garage");
  lower = lower.replace(/\bair conditioning\b/g, "climatisation");
  lower = lower.replace(/\bheating maintenance\b/g, "entretien du chauffage");

  // Hand-tuned rewrites for common awkward outputs.
  lower = lower.replace(/\br[ée]paration\s+fuite(s)?\b/g, "réparation de fuites");
  lower = lower.replace(/\binstallation\s+sanitaires?\b/g, "installation de sanitaires");
  lower = lower.replace(/\bentretien\s+(du\s+)?chauffe-eau(x)?\b/g, "entretien du chauffe-eau");
  lower = lower.replace(/\bd[ée]bouchage\s+canalisations?\b/g, "débouchage de canalisations");
  lower = lower.replace(/\bd[ée]bouchage\b(?!\s+de\b)/g, "débouchage de canalisations");

  // General grammar fixes ("réparation fuite" -> "réparation de fuite", etc.).
  // A label already followed by d' / d’ / de / du / des is left alone.
  if (/^r[ée]paration\s+/.test(lower) && !/^r[ée]paration\s+d['\u2019eu]/.test(lower)) {
    lower = lower.replace(/^r[ée]paration\s+/, "réparation de ");
  }
  if (/^installation\s+/.test(lower) && !/^installation\s+d['\u2019eu]/.test(lower)) {
    lower = lower.replace(/^installation\s+/, "installation de ");
  }
  // The elided article (d') is directly followed by the noun, so it must not require a space.
  if (/^entretien\s+/.test(lower) && !/^entretien\s+(?:d['\u2019]|(?:de|du|des)\s)/.test(lower)) {
    lower = lower.replace(/^entretien\s+/, "entretien de ");
  }

  lower = normSpaces(lower);
  if (!lower) return null;
  if (GENERIC_SERVICE_LABELS.has(normalizeBceForTokenMatch(lower))) return null;

  return capitalizeFirst(lower);
}

/**
 * Normalizes a list of services.
 *
 * Each entry goes through normalizeServiceLabel(); rejected entries are skipped and duplicates
 * (compared on their normalizeBceForTokenMatch() form) are dropped.
 *
 * @param {Array|string} raw - Array of labels, or a comma-separated string.
 * @param {{max?: number}} [options] - `max`: the scan stops once this many labels are kept (default 6).
 * @returns {string[]} Cleaned labels in input order.
 */
function normalizeServicesList(raw, { max = 6 } = {}) {
  const candidates = Array.isArray(raw) ? raw : String(raw || "").split(",");
  const kept = [];
  const keptKeys = new Set();

  for (const candidate of candidates) {
    const label = normalizeServiceLabel(candidate);
    if (!label) continue;

    const dedupeKey = normalizeBceForTokenMatch(label);
    if (!dedupeKey || keptKeys.has(dedupeKey)) continue;

    keptKeys.add(dedupeKey);
    kept.push(label);
    if (kept.length >= max) break;
  }

  return kept;
}

/**
 * Joins service labels into a ", "-separated string.
 *
 * @param {string[]} services - Labels; anything that is not an array is treated as empty.
 * @returns {string|null} The joined labels (each cleaned with normSpaces, empty ones dropped),
 *   or null when nothing remains.
 */
function servicesListToCsv(services) {
  const input = Array.isArray(services) ? services : [];
  const cleaned = [];
  for (const item of input) {
    const label = normSpaces(item);
    if (label) cleaned.push(label);
  }
  return cleaned.length > 0 ? cleaned.join(", ") : null;
}

/**
 * Produces a comparison key from a name: lower-cased, accents removed, every run of characters
 * other than a-z / 0-9 replaced by one space, trimmed.
 *
 * @param {*} s - Input; falsy values yield "".
 * @returns {string} Normalized key, e.g. "Liège-Centre" gives "liege centre".
 */
function normalizeCityName(s) {
  const lowered = String(s || "").toLowerCase();
  // NFD splits accented letters into base letter + combining mark; the marks are then dropped.
  const unaccented = lowered.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
  return unaccented.replace(/[^a-z0-9]+/g, " ").trim();
}

/**
 * Tells whether two city names designate the same place, ignoring case, accents and punctuation.
 *
 * Names match when their normalized forms are equal, or when one appears inside the other as a
 * run of whole words ("Liège" matches "Liège Centre", "Louvain" matches "Louvain-la-Neuve").
 *
 * Containment used to be a raw substring test, which produced false positives for short names:
 * "Ans" matched "Ransart" and "Ath" matched "Bathurst". It is now restricted to word boundaries.
 *
 * @param {*} a - First city name.
 * @param {*} b - Second city name.
 * @returns {boolean} True when the names match; false when either is empty after normalization.
 */
function sameCity(a, b) {
  const na = normalizeCityName(a);
  const nb = normalizeCityName(b);
  if (!na || !nb) return false;
  if (na === nb) return true;

  // normalizeCityName() separates words with single spaces, so padding both sides with a space
  // turns a substring test into a whole-word test.
  const paddedA = ` ${na} `;
  const paddedB = ` ${nb} `;
  return paddedA.includes(paddedB) || paddedB.includes(paddedA);
}

/**
 * Counts the matches of `regex` in `text`.
 *
 * The pattern is always applied globally (a "g" flag is added when missing) on a fresh RegExp,
 * so the caller's regex object and its lastIndex are never touched.
 *
 * @param {*} text - Text to scan; falsy values count as empty.
 * @param {RegExp} regex - Pattern to count.
 * @returns {number} Number of matches; 0 for empty text or a missing regex.
 */
function countRegexMatches(text, regex) {
  const haystack = String(text || "");
  if (!haystack || !regex) return 0;

  const globalFlags = regex.flags.includes("g") ? regex.flags : `${regex.flags}g`;
  const matcher = new RegExp(regex.source, globalFlags);
  return Array.from(haystack.matchAll(matcher)).length;
}

/**
 * Tells whether a city name refers to Brussels, in any of the spellings "bruxelles",
 * "brussels" or "brussel" (as a whole word of the normalized name).
 *
 * @param {*} city - City name.
 * @returns {boolean} True for a Brussels spelling; false otherwise or when the normalized name is empty.
 */
function isBrusselsCityHint(city) {
  const normalized = normalizeBceForTokenMatch(city);
  return normalized ? /\b(?:bruxelles|brussels|brussel)\b/.test(normalized) : false;
}

/**
 * Cleans an industry label for use as a search keyword: text cleanup via normSpaces, typographic
 * quotes removed, slashes (with surrounding spaces) replaced by one space, result trimmed.
 *
 * @param {*} s - Raw label; falsy values yield "".
 * @returns {string} Cleaned keyword.
 */
function cleanIndustryKeyword(s) {
  const normalized = normSpaces(String(s || ""));
  // Only curly quotes are stripped; accented letters such as "â" are valid French and kept.
  const withoutCurlyQuotes = normalized.replace(/[“”‘’]/g, "");
  const slashesAsSpaces = withoutCurlyQuotes.replace(/\s*\/\s*/g, " ");
  return slashesAsSpaces.trim();
}

/**
 * Derives search terms from an industry label such as
 * "Plombier / Chauffagiste (urgence / entretien)".
 *
 * - The text outside parentheses is split on "/" into main parts; the text inside parentheses is
 *   split on "/" into qualifiers.
 * - Core terms: every main part that is not a lone generic word (entretien, installation, ...).
 * - Extra terms: all main parts joined with spaces (when there are at least two), then
 *   "<first main part> <qualifier>" for every non-generic qualifier.
 * - All core terms are always returned; extras are appended until the total reaches
 *   max(number of core terms, SEARCH_TERMS_MAX).
 * Terms are de-duplicated on their normalizeCityName() form.
 *
 * @param {*} industry - Industry label.
 * @returns {string[]} Search terms; [] for an empty label; the cleaned label alone when no term
 *   could be derived.
 */
function extractIndustrySearchTerms(industry) {
  const label = String(industry || "").trim();
  if (!label) return [];

  const GENERIC_SINGLE = new Set(["entretien", "installation", "pose", "service", "services", "prestation", "prestations"]);
  const keyOf = (term) => normalizeCityName(term);
  const isGeneric = (term) => GENERIC_SINGLE.has(keyOf(term));

  // Splits on "/" and keeps the non-empty cleaned pieces.
  const splitOnSlash = (text) => {
    const pieces = [];
    for (const chunk of String(text || "").split("/")) {
      const cleaned = cleanIndustryKeyword(chunk);
      if (cleaned) pieces.push(cleaned);
    }
    return pieces;
  };

  // Ordered list of cleaned terms, unique on their normalized key.
  const makeCollector = () => {
    const items = [];
    const keys = new Set();
    const add = (term) => {
      const cleaned = cleanIndustryKeyword(term);
      if (!cleaned) return;
      const key = keyOf(cleaned);
      if (!key || keys.has(key)) return;
      keys.add(key);
      items.push(cleaned);
    };
    return { items, keys, add };
  };

  // Pull the parenthesised segments out of the label.
  const bracketed = [];
  const outside = label.replace(/\(([^)]*)\)/g, (_, inner) => {
    if (inner) bracketed.push(String(inner));
    return " ";
  });

  const mainParts = splitOnSlash(outside);
  const qualifiers = splitOnSlash(bracketed.join(" / "));
  const primary = mainParts[0] || cleanIndustryKeyword(label);

  // Core: each main token between slashes.
  const core = makeCollector();
  for (const part of mainParts) {
    if (!isGeneric(part)) core.add(part);
  }

  // Extra combinations.
  const extras = makeCollector();
  if (mainParts.length >= 2) extras.add(mainParts.join(" "));
  for (const qualifier of qualifiers) {
    if (!isGeneric(qualifier)) extras.add(`${primary} ${qualifier}`);
  }

  // Always keep all core terms. SEARCH_TERMS_MAX only limits extras.
  const result = [...core.items];
  const resultKeys = new Set(core.keys);
  const cap = Math.max(core.items.length, SEARCH_TERMS_MAX);
  for (const extra of extras.items) {
    if (result.length >= cap) break;
    const key = keyOf(extra);
    if (!key || resultKeys.has(key)) continue;
    resultKeys.add(key);
    result.push(extra);
  }

  return result.length ? result : [cleanIndustryKeyword(label)];
}

// Canonical label for an industry: the first extracted search term, else the
// cleaned raw label, else null.
function canonicalIndustryLabel(industry) {
  const [firstTerm] = extractIndustrySearchTerms(industry);
  return firstTerm || cleanIndustryKeyword(industry) || null;
}

// Build one search plan ({ query, searchKeyword, categoryIndustry }) per distinct
// "<term> <city>" query. Falls back to a single "<industry> <city>" plan when no
// term produced a usable query.
function buildSearchQueries(industry, city) {
  const categoryIndustry = canonicalIndustryLabel(industry) || cleanIndustryKeyword(industry) || String(industry || "");
  const searchTerms = extractIndustrySearchTerms(industry);

  const seenKeys = new Set();
  const plans = [];
  searchTerms.forEach((searchKeyword) => {
    const query = `${searchKeyword} ${city}`.trim();
    const dedupeKey = normalizeCityName(query);
    if (!dedupeKey || seenKeys.has(dedupeKey)) return;
    seenKeys.add(dedupeKey);
    plans.push({ query, searchKeyword, categoryIndustry });
  });

  if (plans.length === 0) {
    plans.push({
      query: `${industry} ${city}`.trim(),
      searchKeyword: categoryIndustry,
      categoryIndustry,
    });
  }

  return plans;
}

// Decode a small set of HTML entities found in scraped titles/names.
// - Named entities are looked up case-insensitively; accented letters are
//   deliberately mapped to their unaccented ASCII form (e.g. &eacute; -> "e").
// - Unknown named entities are left untouched.
// - Numeric entities (&#NNN; / &#xHHH;) become the matching code point, or ""
//   when the number is zero, negative or not a valid code point.
// The trailing ";" is optional for every form.
function decodeBasicHtmlEntities(s) {
  const named = {
    amp: "&",
    quot: '"',
    apos: "'",
    lt: "<",
    gt: ">",
    nbsp: " ",
    eacute: "e",
    egrave: "e",
    ecirc: "e",
    euml: "e",
    agrave: "a",
    aacute: "a",
    acirc: "a",
    ccedil: "c",
    icirc: "i",
    iuml: "i",
    ugrave: "u",
    ucirc: "u",
    uuml: "u",
    ocirc: "o",
    // "oelig" is the real HTML entity name for the oe ligature; "oe" is kept only for
    // backward compatibility with the previous table.
    oelig: "oe",
    oe: "oe",
    rsquo: "'",
    lsquo: "'",
    rdquo: '"',
    ldquo: '"',
    ndash: "-",
    mdash: "-",
  };

  // Shared decoder for decimal and hexadecimal numeric references.
  const decodeNumeric = (digits, radix) => {
    const n = Number.parseInt(String(digits || ""), radix);
    if (!Number.isFinite(n) || n <= 0) return "";
    try {
      return String.fromCodePoint(n);
    } catch {
      // Out-of-range code point (RangeError): drop the reference.
      return "";
    }
  };

  return String(s || "")
    .replace(/&([a-z][a-z0-9]+);?/gi, (m, entity) => {
      const key = String(entity || "").toLowerCase();
      // Own-property check so names like "constructor" never resolve to inherited members.
      return Object.prototype.hasOwnProperty.call(named, key) ? named[key] : m;
    })
    .replace(/&#x([0-9a-f]+);?/gi, (_, hex) => decodeNumeric(hex, 16))
    .replace(/&#([0-9]+);?/gi, (_, dec) => decodeNumeric(dec, 10));
}

// Clean a raw site name / page title into a company-name candidate.
// Returns the cleaned string, or null when the input is falsy or nothing is left.
// The steps are order-dependent: each one works on the output of the previous one.
function normalizeCompanyNameCandidate(raw) {
  if (!raw) return null;
  let v = decodeBasicHtmlEntities(raw);
  // Remove e-mail addresses and phone numbers embedded in the text (helpers defined elsewhere).
  v = stripEmailsFromFreeText(v);
  v = stripPhonesFromFreeText(v);
  v = normSpaces(v);
  // Drop a trailing page-label suffix such as "| Accueil", "- Contact ..." or "| Mentions ...".
  v = v.replace(/\s*[|\-]\s*(accueil|home|contact.*|mentions?.*|politique.*|privacy.*|cookie.*)$/i, "").trim();
  // Drop a leading page label ("Contact:", "A propos de", "About us", ...).
  v = v.replace(/^(contact|contactez[-\s]?nous|mentions? legales?|a propos(?: de)?|about us|respect de la vie privee)\b[:\s-]*/i, "").trim();
  // Cut everything from UI-widget text or legal boilerplate onwards.
  v = v.replace(/\b(click to open|scroll to top)\b.*$/i, "").trim();
  v = v.replace(/\b(mentions legales?|politique de confidentialite|traitement des donnees personnelles)\b.*$/i, "").trim();
  // Keep only the text before the first "|" and, if present, before the first " - ".
  v = v.split("|")[0].trim();
  if (/\s-\s/.test(v)) {
    v = v.split(/\s-\s/)[0].trim();
  }
  {
    const dotParts = v.split(".");
    // Convert domain-like brand formatting into readable company name (e.g. "Reno.energy" -> "Reno Energy").
    // Only applies when every dot-separated part is purely alphanumeric (so no spaces anywhere)
    // and at least one part has 3+ characters.
    if (
      dotParts.length >= 2 &&
      dotParts.every((part) => /^[A-Za-z0-9À-ÖØ-öø-ÿ]+$/u.test(part)) &&
      dotParts.some((part) => part.length >= 3)
    ) {
      v = dotParts.join(" ");
    }
  }
  // "Name: tagline with several words" -> keep "Name". A one-word tail is left in place.
  if (v.includes(":")) {
    const parts = v.split(":");
    const head = normSpaces(parts.shift() || "");
    const tail = normSpaces(parts.join(":"));
    if (head && tail && tail.split(/\s+/).filter(Boolean).length >= 2) {
      v = head;
    }
  }
  v = v.replace(/\s+/g, " ");
  return v || null;
}

// True when the whole (space-normalized) value looks like a bare domain name,
// e.g. "my-shop.be": alphanumeric start, then letters/digits/dots/hyphens, then
// a dot and an alphabetic suffix of 2+ letters.
function isDomainLikeCompanyName(raw) {
  const DOMAIN_SHAPE_RE = /^[a-z0-9][a-z0-9.-]+\.[a-z]{2,}$/i;
  const candidate = normSpaces(String(raw || ""));
  return candidate ? DOMAIN_SHAPE_RE.test(candidate) : false;
}

// Heuristic filter: returns true when `name` should NOT be used as a company name
// (empty, page label, boilerplate, error page, pure trade wording, sentence-like text).
// `v` is the space-normalized original; `n` is its normalizeCityName() form, which the
// unaccented lowercase patterns below are written against
// (presumably lowercased and accent-stripped; confirm in normalizeCityName).
function looksLikeNoiseCompanyName(name) {
  const v = normSpaces(String(name || ""));
  if (!v) return true;
  const n = normalizeCityName(v);
  if (!n || n.length < 3) return true;
  if (isDomainLikeCompanyName(v)) return true;
  // Starts with a French possessive/article: reads as a slogan rather than a brand.
  if (/^(nos?|votre|notre|vos|des)\b/.test(n)) return true;
  // Starts with a navigation/page label or CMS default.
  if (/^(contact|mentions legales|a propos|about|home|accueil|wordpress)\b/.test(n)) return true;
  // UI-widget or legal boilerplate anywhere in the text.
  if (/click to open|scroll to top|privacy|politique|cookies/.test(n)) return true;
  if (/solutions?\s+sur\s+mesure/.test(n)) return true;
  if (
    /(respect de la vie privee|traitement des donnees personnelles|mentions legales|votre expert|votre installateur|a la recherche de|societe de nettoyage)/.test(
      n
    )
  ) {
    return true;
  }
  // HTTP error pages (404, "Erreur 404", "Page not found", etc.)
  if (/\b404\b/.test(v)) return true;
  if (/^(erreur|error|page\s+not\s+found|not\s+found|introuvable|oops)\b/i.test(n)) return true;
  // Generic service-description names containing city hint (SEO titles used as names)
  // e.g. "Travaux d'égouts à Liège", "Débouchage à Liège 24H"
  if (/\b(egouts?|canalisations?|travaux\s+de)\b/.test(n)) return true;
  // Single-word pure trade term (e.g., "Crépis", "Plomberie", "Toiture")
  const normBce = normalizeBceForTokenMatch(v);
  if (normBce && !normBce.includes(" ") && GENERIC_TRADE_ALIAS_TOKENS.has(normBce)) return true;
  // Multi-word names made exclusively of trade tokens + connectors — no brand identifier
  // e.g., "Toiture et parachèvement", "Réparation et entretien"
  if (normBce && normBce.includes(" ")) {
    const allTok = normBce.split(" ").filter(Boolean);
    if (
      allTok.length >= 2 &&
      allTok.length <= 5 &&
      allTok.every((t) => GENERIC_TRADE_ALIAS_TOKENS.has(t) || COMPANY_CONNECTOR_TOKENS.has(t))
    )
      return true;
  }
  // Nine or more words, or any "?" / "!": treated as a sentence, not a name.
  if (v.split(/\s+/).length >= 9) return true;
  if (/[?!]/.test(v)) return true;
  return false;
}

// Tokens ignored when extracting identity tokens from a company name
// (see companyIdentityTokens). Entries are lowercase and unaccented.
const COMPANY_NAME_MATCH_STOPWORDS = new Set([
  // Legal forms and generic corporate words
  "srl", "sprl", "sa", "asbl", "sc", "scrl", "snc", "sr", "srlu", "bv", "bvba", "ltd", "inc",
  "entreprise", "enterprise", "company", "group", "limited", "solutions", "societe",
  // Marketing / SEO filler
  "service", "services", "reservation", "booking", "artisan", "urgences", "urgence",
  "tarif", "devis", "gratuit", "intervention", "rapide",
  // Trade and sector words
  "plombier", "plomberie", "chauffagiste", "chauffage", "electricien", "serrurier", "vitrier",
  "debouchage", "toiture", "nettoyage", "support", "informatique", "marketing", "agence",
  // Possessives and articles
  "votre", "notre", "mon", "ma", "vos", "les", "des", "du", "de", "la", "le",
]);
// Generic trade/service words (lowercase, unaccented). A name made only of these
// (plus connectors or city hints) carries no brand identifier.
const GENERIC_TRADE_ALIAS_TOKENS = new Set([
  "plombier", "plomberie", "debouchage", "chauffagiste", "chauffage", "electricien",
  "serrurier", "vitrier", "toiture", "isolation", "facade", "facadier", "nettoyage",
  "depannage", "entretien", "installation", "reparation", "sanitaire", "renovation",
  // Additional trade/service terms that should not appear in company names alone
  "egout", "egouts", "canalisation", "canalisations", "couvreur", "climatisation",
  "crepi", "crepis", "parachevement", "peinture", "peintre", "maconnerie", "macon",
  "menuiserie", "menuisier", "travaux", "service", "services", "urgence", "urgences",
  "construct", "construction", "enterprise",
  // English equivalents (some Belgian companies present themselves in English)
  "plumber", "plumbing", "electrician", "locksmith", "roofer", "roofing", "painter",
  "painting", "drainage", "heating", "glazier", "mason", "carpenter", "carpentry", "insulation",
]);
// French linking words that may sit between trade/city tokens without adding identity.
const COMPANY_CONNECTOR_TOKENS = new Set([
  "de", "du", "des",
  "la", "le", "les",
  "et", "a", "au", "sur",
]);

// Extract up to `maxTokens` distinctive tokens from a company name, in order of
// appearance. Short (<3 chars), purely numeric, stopword and duplicate tokens
// are skipped.
function companyIdentityTokens(raw, maxTokens = 6) {
  const normalized = normalizeBceForTokenMatch(raw);
  if (!normalized) return [];

  const isDistinctive = (word) =>
    word.length >= 3 && !/^\d+$/.test(word) && !COMPANY_NAME_MATCH_STOPWORDS.has(word);

  // A Set keeps insertion order and gives the duplicate check for free.
  const kept = new Set();
  for (const word of normalized.split(" ")) {
    if (!word || !isDistinctive(word) || kept.has(word)) continue;
    kept.add(word);
    if (kept.size >= maxTokens) break;
  }
  return [...kept];
}

// True when the two names have at least one identity token in common
// (each side contributes at most 8 tokens).
function sharesCompanyIdentity(a, b) {
  const tokensA = companyIdentityTokens(a, 8);
  const tokensB = companyIdentityTokens(b, 8);
  if (tokensA.length === 0 || tokensB.length === 0) return false;

  const lookup = new Set(tokensB);
  for (const token of tokensA) {
    if (lookup.has(token)) return true;
  }
  return false;
}

// Detects "<trade> <city>" style aliases (2 to 5 tokens) with no brand word:
// at least one trade token, at least one city-hint token, and nothing else
// apart from numbers, connectors and tokens shorter than 3 characters.
function looksLikeTradeCityAliasCompanyName(raw) {
  const normalized = normalizeCityName(raw);
  if (!normalized) return false;

  const words = normalized.split(" ").filter(Boolean);
  if (words.length < 2 || words.length > 5) return false;

  const seen = { trade: false, city: false, brand: 0 };
  words.forEach((word) => {
    // Classification order matters: numeric, trade, city, connector, then brand.
    if (/^\d+$/.test(word)) return;
    if (GENERIC_TRADE_ALIAS_TOKENS.has(word)) {
      seen.trade = true;
    } else if (CITY_HINT_TOKENS.has(word)) {
      seen.city = true;
    } else if (!COMPANY_CONNECTOR_TOKENS.has(word) && word.length >= 3) {
      seen.brand += 1;
    }
  });

  return seen.trade && seen.city && seen.brand === 0;
}

// Returns true when `raw` reads like a generic SEO headline ("Plombier Liège",
// "Votre chauffagiste", "Devis gratuit ...") rather than a brand name.
// All patterns run against the lowercase, accent-stripped, alphanumeric-only form
// produced by normalizeBceForTokenMatch().
function looksLikeSeoCatchallCompanyName(raw) {
  const n = normalizeBceForTokenMatch(raw);
  if (!n) return false;
  if (looksLikeTradeCityAliasCompanyName(raw)) return true;
  // Marketing hooks (urgency, pricing, availability) anywhere in the name.
  if (/\b(urgences?|tarif|competitif|devis|gratuit|intervention|rapide|24|7j)\b/.test(n)) return true;
  if (/\bsolutions?\s+sur\s+mesure\b/.test(n)) return true;

  // The remaining rules only fire when the name contains a trade/sector word.
  const hasTradeLexicon =
    /\b(plombier|plomberie|chauffagiste|chauffage|electricien|serrurier|vitrier|debouchage|toiture|nettoyage|support|informatique|agence|service|artisan|chassis|menuiserie|facade|facadier|crepi|crepis|parachevement|isolation|construct|construction|enterprise|plumber|plumbing|electrician|locksmith|roofer|roofing|painter|painting|drainage|heating|glazier|carpenter|carpentry|insulation)\b/.test(
      n
    );
  const startsGeneric = /^(votre|notre|nos?|mon|ma|vos|le|la|les)\b/.test(n);
  const words = n.split(" ").filter(Boolean);
  if (hasTradeLexicon && /\b(reservation|booking)\b/.test(n)) return true;
  // Short "... service ..." name with no identity token left after stopword removal.
  if (hasTradeLexicon && /\bservice\b/.test(n) && words.length <= 4 && companyIdentityTokens(raw, 8).length === 0) return true;
  if (hasTradeLexicon && startsGeneric) return true;
  // Long trade phrase (5+ words) containing a preposition: sentence-like.
  if (hasTradeLexicon && words.length >= 5 && /\b(a|de|du|des)\b/.test(n)) return true;
  return false;
}

// Title-case a business name for display. Returns null for empty input.
// - The first letter after start-of-string, whitespace, "-" or "'" is capitalised,
//   including accented letters ("établissements" -> "Établissements").
// - Belgian legal forms (SRL, SPRL, SA, ...) are upper-cased whatever their casing
//   after the title-case step, and only when they stand alone as a token
//   (accented letters count as letters, so "résa" is not turned into "réSA").
//   Note: a standalone French "sa" is therefore rendered "SA" as well.
// - "Enterprise" / "And" are rewritten to their French equivalents.
// - 2-4 letter tokens that were all-uppercase ASCII in the input are restored as acronyms.
function titleCaseBusinessName(s) {
  const orig = normSpaces(String(s || ""));
  if (!orig) return null;
  // Preserve tokens that are already all-uppercase and 2-4 chars (acronyms like AMC, NJ, HC, DCS).
  const acronymTokens = new Set(
    orig.split(/\s+/).filter((t) => t.length >= 2 && t.length <= 4 && /^[A-Z]+$/.test(t))
  );
  let v = orig.toLowerCase();
  // \p{Ll} (any lowercase letter) instead of [a-z] so accented initials are capitalised too.
  v = v.replace(/(^|[\s'-])(\p{Ll})/gu, (_, lead, letter) => `${lead}${letter.toUpperCase()}`);
  // Case-insensitive: the previous step has already turned "srl" into "Srl".
  // Unicode-aware lookarounds replace \b, which treats accented letters as word breaks.
  v = v.replace(
    /(?<![\p{L}\p{N}_])(srl|sprl|sa|asbl|sc|scrl|snc|bv|bvba)(?![\p{L}\p{N}_])/giu,
    (m) => m.toUpperCase()
  );
  // Normalize English words to French equivalents in display names.
  v = v.replace(/\bEnterprise\b/g, "Entreprise");
  v = v.replace(/\b(And|AND)\b/g, "et");
  // Restore preserved acronyms
  if (acronymTokens.size) {
    v = v.replace(/\b[A-Za-z]{2,4}\b/g, (m) => (acronymTokens.has(m.toUpperCase()) ? m.toUpperCase() : m));
  }
  return v;
}

// Choose the display name for a company from three candidates:
//   siteName      - name found on the website,
//   bceLegalName  - official BCE legal name (title-cased here),
//   fallbackTitle - page title used as last resort.
// Each candidate is cleaned, clamped to 120 chars (clampLine) and rejected when
// looksLikeNoiseCompanyName() flags it. Returns the chosen string or null.
function pickCompanyName({ siteName, bceLegalName, fallbackTitle }) {
  const site = normalizeCompanyNameCandidate(siteName);
  const legal = titleCaseBusinessName(bceLegalName);
  const title = normalizeCompanyNameCandidate(fallbackTitle);
  const siteC = site ? clampLine(site, 120) : null;
  const legalC = legal ? clampLine(legal, 120) : null;
  const titleC = title ? clampLine(title, 120) : null;

  const siteOk = siteC && !looksLikeNoiseCompanyName(siteC);
  const legalOk = legalC && !looksLikeNoiseCompanyName(legalC);
  const titleOk = titleC && !looksLikeNoiseCompanyName(titleC);

  // Prefer BCE legal name when the site label looks like a generic SEO headline.
  if (siteOk && legalOk && looksLikeSeoCatchallCompanyName(siteC) && !sharesCompanyIdentity(siteC, legalC)) {
    return legalC;
  }

  // Only return siteC directly if it's not an SEO catchall, OR if we have a legal name
  // that already validated the site identity (handled above). Without a BCE fallback,
  // an SEO-catchall site name like "Plombier Liège" must be rejected.
  if (siteOk && (!looksLikeSeoCatchallCompanyName(siteC) || legalOk)) return siteC;
  if (legalOk) return legalC;
  // Same gate for title: don't use an SEO-catchall title when there is no BCE name.
  // NOTE(review): legalOk is always falsy here (the line above returned otherwise),
  // so the `|| legalOk` part of this condition has no effect.
  if (titleOk && (!looksLikeSeoCatchallCompanyName(titleC) || legalOk)) return titleC;

  // Hard fail-safe: never return values already classified as noisy or as SEO catchalls.
  // BCE legal name is exempt from the SEO filter (official registration can look trade-like).
  // NOTE(review): every candidate that would pass this predicate appears to have been
  // returned by one of the branches above, so this looks like it always yields null — confirm.
  const safeFallback = [legalC, siteC, titleC].find(
    (x) => x && !looksLikeNoiseCompanyName(x) && (x === legalC || !looksLikeSeoCatchallCompanyName(x))
  );
  return safeFallback || null;
}

// Parse `u` as an absolute URL and return its serialized form, or null when it
// cannot be parsed.
function safeUrl(u) {
  let parsed;
  try {
    parsed = new URL(u);
  } catch {
    return null;
  }
  return parsed.toString();
}

// Hostname of `u` without a leading "www.", or "" when `u` is not a valid URL.
function domainOf(u) {
  let hostname;
  try {
    ({ hostname } = new URL(u));
  } catch {
    return "";
  }
  return hostname.startsWith("www.") ? hostname.slice("www.".length) : hostname;
}

// Does `host` fall under the blocklist entry `blockedRaw`?
// - Entry given as a URL: its hostname is used.
// - Entry containing a dot: exact host or any subdomain of it.
// - Entry without a dot (bare keyword): substring match on the host.
// Both sides are compared lowercase and without a leading "www.".
function blockedDomainMatches(host, blockedRaw) {
  const stripWww = (value) => value.replace(/^www\./, "");

  const domain = stripWww(String(host || "").toLowerCase());
  let blocked = stripWww(String(blockedRaw || "").trim().toLowerCase());
  if (!domain || !blocked) return false;

  if (blocked.includes("://")) {
    try {
      blocked = stripWww(new URL(blocked).hostname.toLowerCase());
    } catch {
      // keep raw fallback
    }
  }
  if (!blocked) return false;

  if (!blocked.includes(".")) return domain.includes(blocked);
  return domain === blocked || domain.endsWith(`.${blocked}`);
}

// True when the URL's domain matches at least one BLACKLIST_DOMAINS entry.
function isBlacklisted(u) {
  const host = domainOf(u);
  const matchesHost = (blocked) => blockedDomainMatches(host, blocked);
  return BLACKLIST_DOMAINS.some(matchesHost);
}

// Decide whether a search hit is NOT a company's own official website.
// Returns a short reason code (string) for the first rule that matches, or null
// when no rule fires. Checks run from cheapest to most expensive:
// URL validity -> domain rules -> URL path -> title/snippet/site name -> page body.
// The NON_OFFICIAL_* patterns and countRegexMatches() are defined elsewhere in the file.
function nonOfficialSiteReason({ url, title = "", snippet = "", siteName = "", corpus = "", targetCity = "" }) {
  const u = safeUrl(url || "");
  if (!u) return "invalid_url";

  if (isBlacklisted(u)) return "blacklisted_domain";

  const host = domainOf(u);
  if (!host) return "invalid_domain";
  // Substring match on the host against known directory/portal hints.
  if (NON_OFFICIAL_HOST_HINTS.some((x) => host.includes(x))) return "directory_domain";
  // ".brussels" hosts are only accepted when the target city is a Brussels hint.
  if (host.endsWith(".brussels") && !isBrusselsCityHint(targetCity)) return "out_of_area_brussels_domain";
  if (NON_OFFICIAL_PUBLIC_HOST_RE.test(host)) return "public_institution_domain";

  try {
    const parsed = new URL(u);
    if (NON_OFFICIAL_PATH_RE.test(parsed.pathname || "")) return "directory_path";
  } catch {
    // ignore
  }

  // Metadata check: title + snippet + site name, normalized to lowercase ASCII tokens.
  const metaText = normalizeBceForTokenMatch([title, snippet, siteName].filter(Boolean).join(" "));
  if (metaText) {
    const publicStrongMeta = countRegexMatches(metaText, NON_OFFICIAL_PUBLIC_STRONG_TEXT_RE);
    const publicWeakMeta = countRegexMatches(metaText, NON_OFFICIAL_PUBLIC_WEAK_TEXT_RE);
    const commercialMeta = NON_OFFICIAL_COMMERCIAL_TEXT_RE.test(metaText);
    // One strong public-institution signal is enough; weak signals need 3+ hits
    // and no commercial wording.
    if (publicStrongMeta >= 1 || (publicWeakMeta >= 3 && !commercialMeta)) {
      return "public_institution_meta";
    }
  }
  if (metaText && NON_OFFICIAL_TEXT_RE.test(metaText) && NON_OFFICIAL_CONTEXT_RE.test(metaText)) {
    return "directory_meta";
  }

  // Body check: only the first 7000 characters of the corpus are inspected.
  const body = normalizeBceForTokenMatch(String(corpus || "").slice(0, 7000));
  if (body) {
    const publicStrongBody = countRegexMatches(body, NON_OFFICIAL_PUBLIC_STRONG_TEXT_RE);
    const publicWeakBody = countRegexMatches(body, NON_OFFICIAL_PUBLIC_WEAK_TEXT_RE);
    const commercialBody = NON_OFFICIAL_COMMERCIAL_TEXT_RE.test(body);
    if (publicStrongBody >= 1 || (publicWeakBody >= 3 && !commercialBody)) {
      return "public_institution_content";
    }
  }
  // Directory wording only counts when it appears together with a listing context.
  const hasDirectoryLexicon = /\b(?:annuaire|comparateur|plateforme|platform|marketplace)\b/.test(body);
  const hasListingContext = /\b(?:entreprises?|professionnels?|prestataires?|restaurants?|traiteurs?|artisans?)\b/.test(body);
  if (hasDirectoryLexicon && hasListingContext) return "directory_content";

  return null;
}

// Slugify with French locale rules, after rewriting a few English words
// (" & ", "and", "enterprise", "construct") to their French equivalents.
function frSlugify(v) {
  const frenchRewrites = [
    [/\s+&\s+/g, " et "],
    [/\band\b/gi, "et"],
    [/\benterprise\b/gi, "entreprise"],
    [/\bconstruct\b/gi, "construction"],
  ];

  let text = normSpaces(v || "");
  for (const [pattern, replacement] of frenchRewrites) {
    text = text.replace(pattern, replacement);
  }
  return slugify(text, { lower: true, strict: true, locale: "fr" });
}

// Build the base slug "<keyword>-<city>-<name>" (empty parts skipped), collapse
// repeated hyphens, trim edge hyphens and cap the result at 80 characters.
// Never returns an empty string: "entreprise" is the last-resort value.
function makeSlug({ industryKeyword, city, name }) {
  const MAX_BASE_LEN = 80;
  const keyword = cleanIndustryKeyword(industryKeyword);

  const joined = [keyword, city, name]
    .map((part) => frSlugify(part))
    .filter(Boolean)
    .join("-");
  const seed = joined || frSlugify(name || keyword || city || "entreprise");

  const tidy = seed.replace(/-+/g, "-").replace(/^-|-$/g, "");
  // Cutting at the limit can leave a dangling hyphen; strip it again.
  const capped = tidy.length > MAX_BASE_LEN ? tidy.slice(0, MAX_BASE_LEN).replace(/-+$/g, "") : tidy;

  return capped || "entreprise";
}

// Return a slug not yet present in `usedSlugs` and record it there.
// Collisions are resolved by appending "-2", "-3", ... to the base.
function reserveUniqueSlug(slugBase, usedSlugs) {
  const base = slugBase || "entreprise";
  let candidate = base;
  for (let suffix = 2; usedSlugs.has(candidate); suffix += 1) {
    candidate = `${base}-${suffix}`;
  }
  usedSlugs.add(candidate);
  return candidate;
}

// Prepare a value for use inside a single-quoted SQL literal: coerce to string,
// run it through stripUnsafeUnicodeChars(), then double every single quote.
function escapeSqlString(s) {
  const cleaned = stripUnsafeUnicodeChars(String(s));
  return cleaned.replace(/'/g, "''");
}

// Render a JavaScript value as a SQL literal:
//   null/undefined -> NULL, booleans -> TRUE/FALSE, finite numbers -> as-is
//   (non-finite -> NULL), strings -> quoted, arrays containing a plain object and
//   any other object -> '...'::jsonb, remaining arrays -> ARRAY[...]::text[].
function sqlValue(v) {
  if (v === null || v === undefined) return "NULL";

  const quote = (text) => `'${escapeSqlString(text)}'`;
  const asJsonb = (value) => `${quote(JSON.stringify(value))}::jsonb`;

  switch (typeof v) {
    case "boolean":
      return v ? "TRUE" : "FALSE";
    case "number":
      return Number.isFinite(v) ? String(v) : "NULL";
    case "string":
      return quote(v);
    default:
      break;
  }

  // Objects => jsonb
  if (!Array.isArray(v)) return asJsonb(v);

  // Arrays: either text[] (strings) or jsonb (array of objects)
  const containsPlainObject = v.some((item) => item && typeof item === "object" && !Array.isArray(item));
  if (containsPlainObject) return asJsonb(v);

  const literals = [];
  for (const item of v) {
    if (item === null || item === undefined) continue;
    literals.push(quote(String(item)));
  }
  return `ARRAY[${literals.join(", ")}]::text[]`;
}

// -------------------- PRICING (OPTIONAL) --------------------
// The only pricing-model identifiers accepted downstream.
const PRICING_MODELS = new Set([
  "sur_devis",
  "forfait",
  "horaire",
  "a_partir_de",
]);

// Return the trimmed value when it is a known pricing model, otherwise null.
function normalizePricingModel(v) {
  if (v === null || v === undefined) return null;
  const candidate = String(v).trim();
  if (candidate === "") return null;
  if (!PRICING_MODELS.has(candidate)) return null;
  return candidate;
}

// Coerce a budget level to an integer in 1..3, or null when it is missing,
// not numeric, or out of range. Strings are parsed as base-10 integers;
// other values are converted with Number() and rounded.
function normalizeBudgetLevel(v) {
  if (v === null || v === undefined) return null;

  const numeric = typeof v === "string" ? Number.parseInt(v, 10) : Number(v);
  if (!Number.isFinite(numeric)) return null;

  const level = Math.round(numeric);
  return level >= 1 && level <= 3 ? level : null;
}

// Map a schema.org-style price range ("€", "€€", "$$$", ...) to a budget level:
// 1 symbol -> 1, 2 symbols -> 2, 3 or more -> 3. Returns null when the value is
// empty or contains neither a euro nor a dollar sign.
function budgetLevelFromPriceRange(priceRange) {
  if (!priceRange) return null;
  const s = String(priceRange);
  // The previous pattern held the mis-decoded (mojibake) form of the euro sign, so a real
  // "€" was never counted. The mojibake sequence is still accepted, written with escapes,
  // in case some input was itself mis-decoded.
  const euros = (s.match(/€|\u00e2\u201a\u00ac/g) || []).length;
  const dollars = (s.match(/\$/g) || []).length;
  const n = Math.max(euros, dollars);
  if (!n) return null;
  return Math.min(n, 3);
}

// Space-normalize a free-text price indication and cap it at 160 characters.
// Returns null for missing or empty values.
function normalizePriceIndication(v) {
  const MAX_LEN = 160;
  if (v === null || v === undefined) return null;

  const text = normSpaces(String(v));
  if (!text) return null;
  return text.length > MAX_LEN ? text.slice(0, MAX_LEN) : text;
}

// Infer a pricing model from free text. First match wins, in this order:
// hourly ("€/h", "eur/h", "par heure", "horaire", "taux horaire") -> "horaire",
// "forfait(s)" -> "forfait", "a/à partir de" -> "a_partir_de", "devis" -> "sur_devis".
// Returns null when nothing matches.
function pricingModelFromText(text) {
  // NFC so that a decomposed "a" + combining grave compares equal to "à".
  const t = String(text || "").normalize("NFC").toLowerCase();
  if (!t) return null;
  // The previous pattern held the mis-decoded (mojibake) form of the euro sign and never
  // matched a real "€". The mojibake sequence is still accepted, written with escapes.
  if (/((?:€|\u00e2\u201a\u00ac)\s*\/\s*h|eur\s*\/\s*h|\bpar\s+heure\b|\bhoraire\b|\btaux\s+horaire\b)/.test(t)) return "horaire";
  if (/\bforfait(s)?\b/.test(t)) return "forfait";
  // "\b" cannot be used in front of "à": it is not an ASCII word character, so no boundary
  // exists between a space and "à". Use an explicit "start or non-letter/digit" prefix.
  if (/(?:^|[^\p{L}\p{N}_])[aà]\s+partir\s+de\b/u.test(t)) return "a_partir_de";
  if (/\bdevis\b/.test(t)) return "sur_devis";
  return null;
}

// Promise that resolves (with no value) after roughly `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

// fetch() that is aborted once `timeoutMs` has elapsed. The caller's options are
// passed through, except `signal`, which is always replaced by the timeout signal.
// The timer is cleared whether the request succeeds or fails.
async function fetchWithTimeout(url, options = {}, timeoutMs = FETCH_TIMEOUT_MS) {
  const aborter = new AbortController();
  const timer = setTimeout(() => {
    aborter.abort();
  }, timeoutMs);

  try {
    const response = await fetch(url, { ...options, signal: aborter.signal });
    return response;
  } finally {
    clearTimeout(timer);
  }
}

// -------------------- BCE --------------------
// Status fragments (FR / EN / NL) that mark a BCE record as active.
const BCE_ACTIVE_TOKENS = [
  "actif",
  "active",
  "actief",
];

// Status fragments that mark a BCE record as not usable (inactive, in liquidation,
// struck off, bankrupt, dissolved, ...). Several entries are word stems.
const BCE_BLOCKING_TOKENS = [
  "inactif", "inactive", "liquid", "radi", "cess",
  "faillit", "faillite", "bankrupt",
  "dissolution", "dissous", "annul", "arrete", "arret",
  "stopzetting", "ontbinding", "sluiting",
];

// Strip everything but digits from a BCE number; return the 10-digit string, or
// null when the result is not exactly 10 digits long.
function normalizeBceDigits(raw) {
  const onlyDigits = String(raw || "").replace(/\D/g, "");
  return /^\d{10}$/.test(onlyDigits) ? onlyDigits : null;
}

// Normalize text for token comparison: lowercase, strip diacritics, turn every
// run of non [a-z0-9] characters into one space, and trim.
function normalizeBceForTokenMatch(s) {
  const lowered = String(s || "").toLowerCase();
  // NFD splits accented letters into base letter + combining mark; drop the marks.
  const unaccented = lowered.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
  const spaced = unaccented.replace(/[^a-z0-9]+/g, " ");
  return spaced.trim();
}

// Convert "DD-MM-YYYY" to "YYYY-MM-DD". Only the shape is checked (digit counts
// and dashes), not calendar validity. Returns null for any other input.
function parseDdMmYyyyToIso(raw) {
  const parts = /^(\d{2})-(\d{2})-(\d{4})$/.exec(String(raw || "").trim());
  if (parts === null) return null;
  const [, day, month, year] = parts;
  return [year, month, day].join("-");
}

// Display form of a 10-digit BCE number: "BE" + digits. Null for anything else.
function formatBceForUi(bce10) {
  const isTenDigits = /^\d{10}$/.test(String(bce10 || ""));
  return isTenDigits ? `BE${bce10}` : null;
}

// Split one CSV line into fields. Commas separate fields unless inside double
// quotes; a doubled quote ("") inside a quoted section yields a literal quote.
// Quote characters themselves are not part of the output. Always returns at
// least one field.
function parseCsvLine(line) {
  const fields = [];
  let field = "";
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const ch = line[pos];
    pos += 1; // `pos` now points at the character after `ch`

    if (ch === '"') {
      const isEscapedQuote = quoted && line[pos] === '"';
      if (isEscapedQuote) {
        field += '"';
        pos += 1;
      } else {
        quoted = !quoted;
      }
    } else if (ch === "," && !quoted) {
      fields.push(field);
      field = "";
    } else {
      field += ch;
    }
  }

  fields.push(field);
  return fields;
}

// Stream a UTF-8 CSV file line by line and call `onRow(fields)` for every data row.
// A BOM on the first line is removed; empty lines and header rows (recognised by
// their first cell) are skipped. `onRow` is called synchronously and not awaited.
async function forEachCsvRow(filePath, onRow) {
  const HEADER_FIRST_CELLS = new Set(["EnterpriseNumber", "Variable", "Category", "EntityNumber"]);

  const input = fs.createReadStream(filePath, { encoding: "utf-8" });
  const lines = readline.createInterface({ input, crlfDelay: Infinity });

  let lineNumber = 0;
  for await (const rawLine of lines) {
    lineNumber += 1;
    const text = String(rawLine || "");
    const line = lineNumber === 1 ? text.replace(/^\uFEFF/, "") : text;
    if (!line) continue;

    const row = parseCsvLine(line);
    if (!row.length || HEADER_FIRST_CELLS.has(row[0])) continue;
    onRow(row);
  }
}

// Lookup key "<category>|<code>" built from the trimmed string forms of both
// parts (falsy parts become "").
function makeBceCodeKey(category, code) {
  const asTrimmedText = (value) => String(value || "").trim();
  return [asTrimmedText(category), asTrimmedText(code)].join("|");
}

// Pick a description from a { LANG: text } entry: preferred language first, then
// FR, NL, DE, then any remaining value. The first non-empty candidate is returned
// space-normalized; null when the entry is missing or has no usable value.
function pickCodeDescription(entry, preferredLanguage = BCE_LANGUAGE) {
  if (!entry) return null;

  const byPriority = [preferredLanguage, "FR", "NL", "DE"].map((lang) => entry[lang]);
  const candidates = [...byPriority, ...Object.values(entry)];

  const firstUsable = candidates.find((text) => Boolean(text));
  return firstUsable ? normSpaces(firstUsable) : null;
}

// Turn a raw href/URL string into a normalized absolute URL, or null.
// - mailto:/tel:/callto:/javascript: links are rejected.
// - Protocol-relative ("//host/...") and scheme-less values get "https:".
// - The result is validated through safeUrl().
// NOTE(review): because scheme-less values are prefixed with "https://" before
// absolutize() runs, a site-relative path such as "/contact" becomes
// "https://contact" instead of being resolved against baseUrl. Confirm that
// callers only pass host-style values here.
function normalizePublicUrl(raw, baseUrl = null) {
  const NON_WEB_SCHEME_RE = /^(mailto:|tel:|callto:|javascript:)/i;
  const HAS_SCHEME_RE = /^[a-z][a-z0-9+.-]*:\/\//i;

  let candidate = normSpaces(raw);
  if (!candidate) return null;
  if (NON_WEB_SCHEME_RE.test(candidate)) return null;

  if (candidate.startsWith("//")) {
    candidate = `https:${candidate}`;
  }
  if (!HAS_SCHEME_RE.test(candidate)) {
    candidate = `https://${candidate.replace(/^\/+/, "")}`;
  }

  const resolved = baseUrl ? absolutize(candidate, baseUrl) : candidate;
  return safeUrl(resolved || candidate);
}

// True when `raw` is a URL on SUPABASE_HOST whose path starts with the public
// storage prefix "/storage/v1/object/public/". False for empty or unparsable input
// or when SUPABASE_HOST is not set.
function isSupabaseStorageUrl(raw) {
  if (!raw || !SUPABASE_HOST) return false;

  let parsed;
  try {
    parsed = new URL(String(raw));
  } catch {
    return false;
  }

  const onSupabaseHost = parsed.hostname.toLowerCase() === SUPABASE_HOST;
  return onSupabaseHost && /^\/storage\/v1\/object\/public\//i.test(parsed.pathname);
}

// Returns true when `url` must NOT be kept as a company's social-profile link:
// invalid URLs, personal Facebook profiles, theme/CMS vendor links, share/login/legal
// endpoints, and platform root or section pages that are not a profile.
// Returns false for everything else (including non-social hosts).
function isIgnoredSocialUrl(url) {
  const u = safeUrl(url);
  if (!u) return true;
  try {
    const parsed = new URL(u);
    const host = parsed.hostname.toLowerCase();
    // NOTE(review): this local `path` shadows the `path` module imported at the top of the file.
    const path = parsed.pathname.toLowerCase();
    const segs = path.split("/").filter(Boolean).map((x) => x.toLowerCase());
    const full = `${host}${path}${parsed.search}`.toLowerCase();

    // Personal (numeric-id) Facebook profiles.
    if (/facebook\.com\/profile\.php/.test(full)) return true;
    // Theme / site-builder vendor names anywhere in host, path or query string.
    // NOTE(review): substring match, so a profile slug merely containing e.g. "divi" or "wix" is dropped too.
    if (
      /(axiomthemes|themeforest|envato|oceanwp|wpbakery|elementor|avada|flatsome|divi|beaverbuilder|wordpress|wix)/.test(
        full
      )
    ) {
      return true;
    }
    // Share widgets and account/legal/help endpoints on any host.
    if (/\/(share|sharer|plugins|intent)\b/.test(path)) return true;
    if (/\/(privacy|policies|policy|help|terms|legal|login|checkpoint)\b/.test(path)) return true;

    // Per-platform rules: the bare root URL and known non-profile first path segments are ignored.
    // NOTE(review): endsWith() also matches look-alike hosts such as "notfacebook.com".
    const rootOrEmpty = segs.length === 0;
    if (host.endsWith("facebook.com") || host.endsWith("fb.com") || host === "fb.me") {
      const blocked = new Set([
        "pages",
        "watch",
        "groups",
        "events",
        "marketplace",
        "gaming",
        "search",
        "hashtag",
        "about",
        "business",
      ]);
      if (rootOrEmpty || blocked.has(segs[0])) return true;
    }
    if (host.endsWith("instagram.com")) {
      const blocked = new Set(["p", "reel", "reels", "explore", "accounts", "stories", "about", "developer", "legal"]);
      if (rootOrEmpty || blocked.has(segs[0])) return true;
    }
    if (host.endsWith("linkedin.com") || host.endsWith("linkedin.be")) {
      const blocked = new Set(["feed", "jobs", "learning", "help", "signup", "authwall", "news", "pulse"]);
      if (rootOrEmpty || blocked.has(segs[0])) return true;
    }
    if (host.endsWith("tiktok.com")) {
      const head = segs[0] || "";
      const blocked = new Set(["discover", "tag", "foryou", "login", "about"]);
      if (rootOrEmpty || blocked.has(head)) return true;
      // TikTok profiles always start with "@"; anything else is not a profile page.
      if (!head.startsWith("@")) return true;
    }
  } catch {
    // Any parsing failure: treat the URL as not usable.
    return true;
  }
  return false;
}

// Normalize a social link and return it, or null when it cannot be normalized or
// is filtered out by isIgnoredSocialUrl().
function normalizeSocialProfileUrl(rawUrl, baseUrl = null) {
  const normalized = normalizePublicUrl(rawUrl, baseUrl);
  if (!normalized || isIgnoredSocialUrl(normalized)) return null;
  return normalized;
}

// Identify which social platform a link belongs to.
// Returns { platform, url } for facebook / instagram / linkedin / tiktok links that
// survive normalization and the ignore filter; null otherwise.
function classifySocialUrl(rawUrl, baseUrl = null) {
  const url = normalizePublicUrl(rawUrl, baseUrl);
  if (!url || isIgnoredSocialUrl(url)) return null;

  const host = domainOf(url);
  if (!host) return null;

  // Checked in order; the first platform whose host rule matches wins.
  const platformRules = [
    ["facebook", (h) => h.endsWith("facebook.com") || h.endsWith("fb.com") || h === "fb.me"],
    ["instagram", (h) => h.endsWith("instagram.com")],
    ["linkedin", (h) => h.endsWith("linkedin.com") || h.endsWith("linkedin.be")],
    ["tiktok", (h) => h.endsWith("tiktok.com")],
  ];

  for (const [platform, hostMatches] of platformRules) {
    if (hostMatches(host)) return { platform, url };
  }
  return null;
}

// Parse a number that may use a decimal comma ("3,5"). Only the first comma is
// converted. Returns a finite number, or null for missing/empty/non-numeric input.
function parseNumberLike(v) {
  if (v === null || v === undefined) return null;

  const text = String(v).trim().replace(",", ".");
  if (text === "") return null;

  const parsed = Number(text);
  if (!Number.isFinite(parsed)) return null;
  return parsed;
}

// Normalize an LLM confidence to an integer 0..100. Values <= 1 are read as a
// fraction and multiplied by 100; larger values are taken as a percentage.
// Returns null when the input is not numeric.
function normalizeLlmConfidence(v) {
  const parsed = parseNumberLike(v);
  if (parsed === null) return null;

  const percent = parsed <= 1 ? parsed * 100 : parsed;
  const rounded = Math.round(percent);
  return Math.max(0, Math.min(100, rounded));
}

// Normalize a RAS score to an integer 0..100. Values <= 1 are read as a fraction
// and multiplied by 100; larger values are taken as already being on a 0..100
// scale. Returns null when the input is not numeric.
function normalizeRasScore(v) {
  const parsed = parseNumberLike(v);
  if (parsed === null) return null;

  const onHundredScale = parsed <= 1 ? parsed * 100 : parsed;
  const rounded = Math.round(onHundredScale);
  return Math.max(0, Math.min(100, rounded));
}

// Compute the final RAS score (integer, clamped to 35..98) from the LLM-provided
// score plus bonuses/penalties for how complete the company profile is.
// All inputs are optional; missing counts default to 0 and flags to false.
// The result is deterministic for a given set of inputs (see the seeded spread below).
function buildCalibratedRasScore({
  llmRasScore,
  llmConfidence = null,
  servicesCount = 0,
  ritualsCount = 0,
  tagsCount = 0,
  hasCorporateEmail = false,
  hasEmail = false,
  hasPhone = false,
  hasAddress = false,
  hasCity = false,
  hasFaq = false,
  hasWebsite = false,
  socialCount = 0,
  galleryCount = 0,
  hasCover = false,
  recoveredBceByIdentity = false,
  seed = "",
} = {}) {
  // Baseline: the normalized LLM score, or 62 when it is missing/invalid.
  let score = normalizeRasScore(llmRasScore);
  if (score === null) score = 62;

  // Confidence shifts the score around a pivot of 70 (one point per 7 confidence points).
  if (llmConfidence !== null) score += Math.round((llmConfidence - 70) / 7);

  // Content richness. Note: exactly 2 services leaves the score unchanged.
  if (servicesCount >= 5) score += 4;
  else if (servicesCount >= 3) score += 2;
  else if (servicesCount <= 1) score -= 6;

  if (ritualsCount >= 3) score += 3;
  else if (ritualsCount === 2) score += 1;
  else score -= 4;

  // 5 to 7 tags leaves the score unchanged.
  if (tagsCount >= 8) score += 2;
  else if (tagsCount <= 4) score -= 2;

  // Contact details: missing e-mail is the heaviest single penalty.
  if (hasCorporateEmail) score += 4;
  else if (hasEmail) score += 1;
  else score -= 8;

  if (hasPhone) score += 3;
  else score -= 5;

  if (hasAddress && hasCity) score += 3;
  else if (hasCity) score += 1;
  else score -= 5;

  if (hasWebsite) score += 2;
  if (hasFaq) score += 2;

  // Exactly one social link leaves the score unchanged.
  if (socialCount >= 2) score += 2;
  else if (socialCount === 0) score -= 1;

  // Media: bonus needs a cover AND 4+ gallery images; no cover or <2 images is penalised.
  if (hasCover && galleryCount >= 4) score += 3;
  else if (!hasCover || galleryCount < 2) score -= 2;

  if (recoveredBceByIdentity) score -= 2;

  // Hash-derived jitter: uses `seed` when given, otherwise a string built from the counts.
  const spreadSeed = seed || `${servicesCount}|${ritualsCount}|${tagsCount}|${galleryCount}|${socialCount}`;
  const spread = (Number.parseInt(sha256Hex(spreadSeed).slice(0, 2), 16) % 7) - 3; // -3..+3 deterministic spread
  score += spread;

  return Math.max(35, Math.min(98, Math.round(score)));
}

// True for any value except null, undefined, and strings that are empty or
// whitespace-only.
function hasScalarValue(v) {
  if (v === null || v === undefined) return false;
  return typeof v !== "string" || v.trim().length > 0;
}

// Parse a date string with the Date constructor and return its UTC calendar date
// as "YYYY-MM-DD"; null when the input is empty or unparsable.
// NOTE(review): the date part is taken from toISOString() (UTC), so inputs parsed
// as local time can shift by one day depending on the host time zone.
function normalizeDateToIso(raw) {
  const text = normSpaces(raw);
  if (!text) return null;

  const parsed = new Date(text);
  const isInvalid = Number.isNaN(parsed.getTime());
  return isInvalid ? null : parsed.toISOString().slice(0, 10);
}

// Translate a day-of-week label to its lowercase French name, or null when unknown.
// Accepts English and French full names and 3-letter abbreviations, in any case,
// with optional dots ("Mon.", "lun.") and an optional schema.org URL prefix
// ("https://schema.org/Monday").
function dayToFr(day) {
  const d = String(day || "")
    .toLowerCase()
    .replace(/^https?:\/\/schema\.org\//, "")
    .replace(/\./g, "")
    .trim();

  const map = {
    monday: "lundi",
    mon: "lundi",
    tuesday: "mardi",
    tue: "mardi",
    wednesday: "mercredi",
    wed: "mercredi",
    thursday: "jeudi",
    thu: "jeudi",
    friday: "vendredi",
    fri: "vendredi",
    saturday: "samedi",
    sat: "samedi",
    sunday: "dimanche",
    sun: "dimanche",
    lundi: "lundi",
    lun: "lundi",
    mardi: "mardi",
    mar: "mardi",
    mercredi: "mercredi",
    mer: "mercredi",
    jeudi: "jeudi",
    jeu: "jeudi",
    vendredi: "vendredi",
    ven: "vendredi",
    samedi: "samedi",
    sam: "samedi",
    dimanche: "dimanche",
    dim: "dimanche",
  };
  // Own-property lookup only: a plain `map[d]` would return inherited members for
  // inputs such as "constructor" or "__proto__" instead of null.
  return Object.prototype.hasOwnProperty.call(map, d) ? map[d] : null;
}

// French day names in calendar order, Monday first.
const WEEK_DAY_ORDER = [
  "lundi",
  "mardi",
  "mercredi",
  "jeudi",
  "vendredi",
  "samedi",
  "dimanche",
];

// Normalize "H:MM", "HH:MM" or "HH:MM:SS" to zero-padded "HH:MM".
// Out-of-range parts are clamped (hours to 0..23, minutes to 0..59) rather than
// rejected. Returns null when the input does not have that shape.
function normalizeHour(raw) {
  const text = String(raw || "").trim();
  if (!text) return null;

  const parts = /^(\d{1,2}):(\d{2})(?::\d{2})?$/.exec(text);
  if (!parts) return null;

  const clampTo = (digits, max) => Math.max(0, Math.min(max, Number(digits)));
  const twoDigits = (value) => String(value).padStart(2, "0");

  return `${twoDigits(clampTo(parts[1], 23))}:${twoDigits(clampTo(parts[2], 59))}`;
}

// Parse a loosely written hour into zero-padded "HH:MM".
// Accepted shapes (case-insensitive, "." treated as ":"): "9", "9h", "9h30",
// "9:30", "9.30", "9 h 30". Missing minutes mean ":00".
// Returns null when the shape is not recognised or the time is out of range
// (hours 0..23, minutes 0..59).
function normalizeHourFlexible(raw) {
  const s = String(raw || "")
    .toLowerCase()
    .replace(/\./g, ":")
    .trim();
  if (!s) return null;

  // Minutes are optional after the separator (\d{0,2}); requiring at least one digit
  // made the very common "9h" / "18h" notation fail to parse.
  const m = s.match(/^(\d{1,2})(?:\s*(?:h|:)\s*(\d{0,2}))?$/i);
  if (!m) return null;

  const h = Number(m[1]);
  // m[2] is undefined when there is no separator and "" when the separator has no minutes.
  const min = m[2] !== undefined && m[2] !== "" ? Number(m[2]) : 0;
  if (!Number.isFinite(h) || !Number.isFinite(min)) return null;
  if (h < 0 || h > 23 || min < 0 || min > 59) return null;

  return `${String(h).padStart(2, "0")}:${String(min).padStart(2, "0")}`;
}

// Convert a strict "HH:MM" string to minutes since midnight (0..1439).
// Returns null when the shape is wrong or the time is out of range.
function hourToMinutes(hhmm) {
  const parts = /^(\d{2}):(\d{2})$/.exec(String(hhmm || ""));
  if (parts === null) return null;

  const [, hh, mm] = parts;
  const hours = Number(hh);
  const minutes = Number(mm);

  const isValid =
    Number.isFinite(hours) &&
    Number.isFinite(minutes) &&
    hours >= 0 &&
    hours <= 23 &&
    minutes >= 0 &&
    minutes <= 59;
  return isValid ? hours * 60 + minutes : null;
}

// True only when both times are valid "HH:MM" values and closing is strictly
// later than opening on the same day (ranges crossing midnight are rejected).
function isValidOpeningRange(open, close) {
  const openMinutes = hourToMinutes(open);
  const closeMinutes = hourToMinutes(close);
  return openMinutes !== null && closeMinutes !== null && closeMinutes > openMinutes;
}

// List the French day names from `fromDay` to `toDay` inclusive, following
// WEEK_DAY_ORDER and wrapping past Sunday when needed (e.g. samedi -> lundi).
// Returns [] when either day is not a known French day name.
function expandDayRange(fromDay, toDay) {
  const start = WEEK_DAY_ORDER.indexOf(fromDay);
  const end = WEEK_DAY_ORDER.indexOf(toDay);
  if (start < 0 || end < 0) return [];

  if (start > end) {
    // Wrap-around: tail of the week followed by its beginning.
    return WEEK_DAY_ORDER.slice(start).concat(WEEK_DAY_ORDER.slice(0, end + 1));
  }
  return WEEK_DAY_ORDER.slice(start, end + 1);
}

// Collect opening hours from the `openingHoursSpecification` of flattened JSON-LD
// objects. Returns one entry per day in week order, either
// { day, open, close, isClosed: false } or { day, isClosed: true } when a usable
// open or close time is missing. The first specification seen for a day wins.
// Returns null when no day could be extracted.
function extractOpeningHoursFromJsonLd(flatJsonlds) {
  const entriesByDay = new Map();

  const collect = (spec) => {
    if (!spec || typeof spec !== "object") return;

    const open = normalizeHour(spec.opens);
    const close = normalizeHour(spec.closes);
    const isClosed = !open || !close;
    const rawDays = Array.isArray(spec.dayOfWeek) ? spec.dayOfWeek : [spec.dayOfWeek];

    rawDays.forEach((rawDay) => {
      const day = dayToFr(rawDay);
      if (!day || entriesByDay.has(day)) return;
      entriesByDay.set(day, isClosed ? { day, isClosed: true } : { day, open, close, isClosed: false });
    });
  };

  for (const jsonld of flatJsonlds || []) {
    const specs = jsonld?.openingHoursSpecification;
    const specList = Array.isArray(specs) ? specs : [specs];
    specList.forEach((spec) => collect(spec));
  }

  if (entriesByDay.size === 0) return null;

  const ordered = [];
  for (const day of WEEK_DAY_ORDER) {
    if (entriesByDay.has(day)) ordered.push(entriesByDay.get(day));
  }
  return ordered;
}

/**
 * Extracts weekly opening hours from free text (French or English day names).
 *
 * Four passes run in a fixed order and the first value stored for a day wins,
 * so explicit hours always take precedence over a "closed" mention:
 *   1. day range + hours   ("lundi au vendredi 9h-18h")
 *   2. single day + hours  ("samedi: 10:00 - 12:00")
 *   3. day range + closed  ("samedi - dimanche: fermé")
 *   4. single day + closed ("dimanche fermé")
 *
 * @param {*} text - Raw text to scan; it is stringified as-is (this function
 *   does not strip accents or change case before matching).
 * @returns {Array<object>|null} Entries `{ day, isClosed: true }` or
 *   `{ day, open, close, isClosed: false }` ordered by WEEK_DAY_ORDER, or null
 *   when nothing was recognised.
 */
function extractOpeningHoursFromText(text) {
  const src = String(text || "");
  if (!src) return null;

  const byDay = new Map();
  // First write wins: a day that is already known is never overwritten.
  const setDay = (day, open, close, isClosed = false) => {
    if (!day || byDay.has(day)) return;
    if (isClosed) {
      byDay.set(day, { day, isClosed: true });
      return;
    }
    if (open && close) byDay.set(day, { day, open, close, isClosed: false });
  };

  const dayPattern =
    "(lundi|lun\\.?|mardi|mar\\.?|mercredi|mer\\.?|jeudi|jeu\\.?|vendredi|ven\\.?|samedi|sam\\.?|dimanche|dim\\.?|monday|mon\\.?|tuesday|tue\\.?|wednesday|wed\\.?|thursday|thu\\.?|friday|fri\\.?|saturday|sat\\.?|sunday|sun\\.?)";
  const hourPattern = "(\\d{1,2}(?:\\s*(?:h|:)\\s*\\d{0,2})?)";
  // Accept both "ferme" and the accented "fermé": the text is matched raw, so
  // the usual French spelling would otherwise never be recognised.
  const closedPattern = "(ferm[e\u00e9]|closed)";

  const rangeHoursRe = new RegExp(
    `${dayPattern}\\s*(?:au|a|à|to|\\-|–)\\s*${dayPattern}\\s*[:\\-–]?\\s*${hourPattern}\\s*(?:-|–|a|à|to)\\s*${hourPattern}`,
    "gi"
  );
  let m;
  while ((m = rangeHoursRe.exec(src)) !== null) {
    const fromDay = dayToFr(m[1]);
    const toDay = dayToFr(m[2]);
    const open = normalizeHourFlexible(m[3]);
    const close = normalizeHourFlexible(m[4]);
    if (!fromDay || !toDay || !open || !close) continue;
    for (const day of expandDayRange(fromDay, toDay)) setDay(day, open, close, false);
  }

  const singleHoursRe = new RegExp(`${dayPattern}\\s*[:\\-–]?\\s*${hourPattern}\\s*(?:-|–|a|à|to)\\s*${hourPattern}`, "gi");
  while ((m = singleHoursRe.exec(src)) !== null) {
    const day = dayToFr(m[1]);
    const open = normalizeHourFlexible(m[2]);
    const close = normalizeHourFlexible(m[3]);
    if (!day || !open || !close) continue;
    setDay(day, open, close, false);
  }

  const rangeClosedRe = new RegExp(`${dayPattern}\\s*(?:au|a|à|to|\\-|–)\\s*${dayPattern}\\s*[:\\-–]?\\s*${closedPattern}`, "gi");
  while ((m = rangeClosedRe.exec(src)) !== null) {
    const fromDay = dayToFr(m[1]);
    const toDay = dayToFr(m[2]);
    if (!fromDay || !toDay) continue;
    for (const day of expandDayRange(fromDay, toDay)) setDay(day, null, null, true);
  }

  const singleClosedRe = new RegExp(`${dayPattern}\\s*[:\\-–]?\\s*${closedPattern}`, "gi");
  while ((m = singleClosedRe.exec(src)) !== null) {
    const day = dayToFr(m[1]);
    if (!day) continue;
    setDay(day, null, null, true);
  }

  if (!byDay.size) return null;
  return WEEK_DAY_ORDER.filter((d) => byDay.has(d)).map((d) => byDay.get(d));
}

/**
 * Scores an opening-hours list so that two candidates can be compared.
 *
 * Each entry with a `day` is worth 1 point; an entry that is explicitly open
 * (`isClosed === false`) with both `open` and `close` set is worth 3 points.
 *
 * @param {*} hours - Candidate list of opening-hours entries.
 * @returns {number} Total score; 0 for a non-array or empty list.
 */
function openingHoursQuality(hours) {
  if (!Array.isArray(hours) || hours.length === 0) return 0;

  return hours.reduce((total, entry) => {
    if (!entry || !entry.day) return total;
    const hasFullSlot = entry.isClosed === false && entry.open && entry.close;
    return total + (hasFullSlot ? 3 : 1);
  }, 0);
}

/**
 * Keeps whichever opening-hours list scores higher with openingHoursQuality.
 *
 * @param {*} current - List currently retained.
 * @param {*} candidate - Newly found list.
 * @returns {*} `candidate` only when it scores strictly higher; otherwise
 *   `current` (ties keep the current value).
 */
function pickBestOpeningHours(current, candidate) {
  const currentScore = openingHoursQuality(current);
  const candidateScore = openingHoursQuality(candidate);
  if (candidateScore > currentScore) {
    return candidate;
  }
  return current;
}

/**
 * Cleans an opening-hours list before it is stored.
 *
 * Keeps at most one entry per day (first one wins), re-normalises day names and
 * times, and drops open slots whose times are missing or fail
 * isValidOpeningRange. Entries flagged `isClosed === true` are kept as closed.
 *
 * @param {*} hours - Raw list of opening-hours entries.
 * @returns {Array<object>|null} Cleaned entries ordered by WEEK_DAY_ORDER, or
 *   null when nothing valid remains.
 */
function sanitizeOpeningHours(hours) {
  if (!Array.isArray(hours) || hours.length === 0) return null;

  // Returns the cleaned entry for one row, or null when the row must be dropped.
  const toCleanEntry = (day, row) => {
    if (row.isClosed === true) return { day, isClosed: true };

    const open = normalizeHour(row.open);
    const close = normalizeHour(row.close);
    if (!open || !close) return null;
    if (!isValidOpeningRange(open, close)) return null;
    return { day, open, close, isClosed: false };
  };

  const cleanByDay = new Map();
  for (const row of hours) {
    if (!row || typeof row !== "object") continue;

    const day = dayToFr(row.day);
    if (!day || cleanByDay.has(day)) continue;

    const entry = toCleanEntry(day, row);
    if (entry) cleanByDay.set(day, entry);
  }

  const ordered = [];
  for (const day of WEEK_DAY_ORDER) {
    if (cleanByDay.has(day)) ordered.push(cleanByDay.get(day));
  }
  return ordered.length > 0 ? ordered : null;
}

/**
 * Collects rating signals and individual reviews from JSON-LD nodes.
 *
 * - `aggregateRating`: the retained rating is replaced whenever no count is
 *   known yet, or when the new node reports a larger `reviewCount`.
 * - `review`: each review with a non-empty body is kept once (deduplicated on
 *   author, publication date and the first 120 characters of the text).
 *
 * @param {Array<object>|null|undefined} flatJsonlds - Flattened JSON-LD nodes.
 * @returns {{google_rating: number|null, google_reviews_count: number|null, google_reviews: Array<object>}}
 *   Rating clamped to 0..5, rounded non-negative count, and at most 10 reviews.
 */
function extractGoogleSignalsFromJsonLd(flatJsonlds) {
  const clampToFive = (value) => Math.max(0, Math.min(5, value));

  const readAuthor = (author) => {
    if (typeof author === "string") return normSpaces(author);
    if (typeof author?.name === "string") return normSpaces(author.name);
    return null;
  };

  const aggregate = { rating: null, count: null };
  const collected = [];
  const fingerprints = new Set();

  const addReview = (node) => {
    if (!node || typeof node !== "object") return;

    const author = readAuthor(node.author);
    const body = normSpaces(node.reviewBody || node.description || "");
    const stars = parseNumberLike(node.reviewRating?.ratingValue || node.ratingValue);
    const publishedAt = normalizeDateToIso(node.datePublished);
    const originalUrl = normalizePublicUrl(node.url);
    if (!body) return;

    const fingerprint = `${author || ""}|${publishedAt || ""}|${body.slice(0, 120)}`;
    if (fingerprints.has(fingerprint)) return;
    fingerprints.add(fingerprint);

    collected.push({
      author: author || "Client",
      text: body.slice(0, 800),
      rating: stars === null ? null : clampToFive(stars),
      publishedAt: publishedAt || null,
      originalUrl: originalUrl || null,
    });
  };

  const considerAggregate = (agg) => {
    if (!agg || typeof agg !== "object") return;

    const value = parseNumberLike(agg.ratingValue);
    const count = parseNumberLike(agg.reviewCount);
    if (value === null) return;

    const shouldReplace =
      aggregate.count === null || (count !== null && count > aggregate.count) || aggregate.rating === null;
    if (!shouldReplace) return;

    aggregate.rating = clampToFive(value);
    if (count !== null) aggregate.count = Math.max(0, Math.round(count));
  };

  for (const node of flatJsonlds || []) {
    considerAggregate(node?.aggregateRating);

    const reviewField = node?.review;
    const reviewList = Array.isArray(reviewField) ? reviewField : [reviewField];
    for (const review of reviewList) addReview(review);
  }

  return {
    google_rating: aggregate.rating,
    google_reviews_count: aggregate.count,
    google_reviews: collected.slice(0, 10),
  };
}

/**
 * Maps a free-form job title to one of the canonical founder/manager labels.
 *
 * Rules are evaluated in order and the first matching one wins, so the
 * "co-founder" variants are tested before the plain "founder" ones.
 *
 * @param {*} raw - Job title or role text.
 * @returns {string|null} Canonical label, or null when no rule matches.
 */
function normalizeFounderRoleLabel(raw) {
  const normalized = normalizeBceForTokenMatch(raw);
  if (!normalized) return null;

  const rules = [
    [/co fondateur|co founder|cofondateur|cofounder/, "Co-fondateur"],
    [/fondateur|founder/, "Fondateur"],
    [/administrateur delegue/, "Administrateur delegue"],
    [/directeur general|directrice generale/, "Directeur general"],
    [/ceo/, "CEO"],
    [/gerant|gérant|owner|proprietaire|managing director/, "Gerant"],
  ];

  const matched = rules.find(([pattern]) => pattern.test(normalized));
  return matched ? matched[1] : null;
}

/**
 * Heuristic check that a string looks like a person's full name.
 *
 * A leading honorific is ignored. The value is rejected when it contains a
 * digit, a corporate token (legal form, "services", ...), overlaps with the
 * company name hint, or does not consist of 2 to 4 words. Every word must be a
 * known name particle or a capitalised word that is not a common
 * non-person noun.
 *
 * @param {*} name - Candidate name.
 * @param {string|null} [companyNameHint=null] - Company name; a candidate equal
 *   to it, containing it, or contained in it (after normalisation) is rejected.
 * @returns {boolean} True when the candidate passes every check.
 */
function looksLikePersonName(name, companyNameHint = null) {
  const HONORIFIC_PREFIX = /^(monsieur|madame|mme|mr|mrs|dr)\.?\s+/i;
  const CORPORATE_TOKENS = /\b(srl|sprl|sa|asbl|bv|bvba|scrl|sc|ltd|inc|services?|entreprise|company)\b/;
  const CAPITALISED_WORD = /^\p{Lu}[\p{L}'\-]{1,30}$/u;
  const NAME_PARTICLES = new Set(["de", "du", "des", "la", "le", "van", "von", "di", "da"]);
  // Words that look like proper nouns but are NOT person names
  const NON_PERSON_NOUNS = new Set([
    "publication", "site", "page", "contenu", "information", "responsable",
    "webmaster", "redacteur", "editeur", "administrateur", "direction",
    "service", "accueil", "support", "copyright", "mentions",
  ]);

  const trimmed = normSpaces(String(name || ""));
  if (!trimmed) return false;

  const bare = trimmed.replace(HONORIFIC_PREFIX, "");
  if (!bare || /\d/.test(bare)) return false;

  const normalized = normalizeBceForTokenMatch(bare);
  if (!normalized) return false;
  if (CORPORATE_TOKENS.test(normalized)) return false;

  if (companyNameHint) {
    const company = normalizeBceForTokenMatch(companyNameHint);
    const overlapsCompany =
      company && (normalized === company || normalized.includes(company) || company.includes(normalized));
    if (overlapsCompany) return false;
  }

  const words = bare.split(/\s+/).filter(Boolean);
  if (words.length < 2 || words.length > 4) return false;

  return words.every((word) => {
    const key = normalizeBceForTokenMatch(word);
    if (NAME_PARTICLES.has(key)) return true;
    if (NON_PERSON_NOUNS.has(key)) return false;
    return CAPITALISED_WORD.test(word);
  });
}

/**
 * Detects "names" that are really generic placeholders (team, department,
 * legal-mention wording) rather than an actual founder.
 *
 * @param {*} name - Candidate founder name.
 * @returns {boolean} True for an empty value or one containing a placeholder
 *   expression; false otherwise.
 */
function isFounderPlaceholderName(name) {
  const normalized = normalizeBceForTokenMatch(name);
  if (!normalized) return true;

  const teamOrDepartment =
    /(equipe de direction|team|direction|administration|service client|support|contact|real estate agency|agence immobiliere)/;
  // Legal-mention patterns mistakenly captured as founder names
  const legalMention = /(publication du site|responsable du site|publication|webmaster|redacteur en chef|editeur|editeur du site)/;

  return teamOrDepartment.test(normalized) || legalMention.test(normalized);
}

function founderSourceScoreBonus(urlHint) {
  const s = String(urlHint || "").toLowerCase();
  let bonus = 0;
  if (/\/(about|a-propos|equipe|team|qui-sommes|notre-histoire)/.test(s)) bonus += 3;
  if (/linkedin\.com\/in\//.test(s)) bonus += 3;
  else if (/linkedin\.com/.test(s)) bonus += 2;
  return bonus;
}

/**
 * Finds the most plausible founder/manager mention in free text.
 *
 * Two shapes are searched: "<role> [: - , de chez] <Name>" (base score 6) and
 * "<Name> , - | <role>" (base score 5). Each candidate also gets the
 * founderSourceScoreBonus for `urlHint`, and +2 when the role is a founder
 * role. The highest-scoring candidate is returned (first one found on ties).
 *
 * @param {*} text - Text to scan.
 * @param {object} [options]
 * @param {string|null} [options.companyNameHint=null] - Company name, used to
 *   reject candidates that are really the company itself.
 * @param {string|null} [options.urlHint=null] - URL the text was taken from.
 * @returns {{founder_name: string, founder_role: string, founder_photo_url: null, score: number}|null}
 *   Best candidate, or null when none is found.
 */
function extractFounderFromText(text, { companyNameHint = null, urlHint = null } = {}) {
  const src = String(text || "");
  if (!src) return null;

  const rolePattern =
    "(fondateur(?:rice)?|co[- ]?fondateur(?:rice)?|founder|co[- ]?founder|g[eé]rant(?:e)?|gerant(?:e)?|administrateur(?:trice)?(?:\\s+delegue)?|ceo|directeur(?:trice)?(?:\\s+general(?:e)?)?|owner|proprietaire)";
  const namePattern = "(\\p{Lu}[\\p{L}'\\-]{1,30}(?:\\s+\\p{Lu}[\\p{L}'\\-]{1,30}){1,3})";

  const startsUppercase = /^\p{Lu}/u;

  // The regexes below need the `i` flag so roles match in any case, but with
  // `iu` the `\p{Lu}` in namePattern also matches lowercase letters. The name
  // capture can therefore swallow ordinary lowercase words next to the real
  // name ("Jean Dupont vous accueille"), which looksLikePersonName then rejects
  // as a whole. When the full capture is rejected, retry once after dropping
  // the non-capitalised words on the side that is NOT adjacent to the role
  // ("end" for role-then-name, "start" for name-then-role). Captures that were
  // already accepted are returned unchanged.
  const resolveName = (captured, trimSide) => {
    const full = String(normSpaces(captured) || "");
    if (looksLikePersonName(full, companyNameHint)) return full;

    const words = full.split(/\s+/).filter(Boolean);
    const originalCount = words.length;
    if (trimSide === "end") {
      while (words.length && !startsUppercase.test(words[words.length - 1])) words.pop();
    } else {
      while (words.length && !startsUppercase.test(words[0])) words.shift();
    }
    // Nothing was trimmed, or too little is left to be a full name.
    if (words.length === originalCount || words.length < 2) return null;

    const trimmed = words.join(" ");
    return looksLikePersonName(trimmed, companyNameHint) ? trimmed : null;
  };

  const candidates = [];
  const pushCandidate = (name, role, baseScore) => {
    let score = baseScore + founderSourceScoreBonus(urlHint);
    if (/fondateur|founder/i.test(role)) score += 2;
    candidates.push({ founder_name: name, founder_role: role, founder_photo_url: null, score });
  };

  const roleThenName = new RegExp(`${rolePattern}\\s*(?:[:\\-,]|de|chez)?\\s*${namePattern}`, "giu");
  let m;
  while ((m = roleThenName.exec(src)) !== null) {
    const role = normalizeFounderRoleLabel(m[1]) || "Fondateur";
    const name = resolveName(m[2], "end");
    if (!name) continue;
    pushCandidate(name, role, 6);
  }

  const nameThenRole = new RegExp(`${namePattern}\\s*(?:,|-|\\|)\\s*${rolePattern}`, "giu");
  while ((m = nameThenRole.exec(src)) !== null) {
    const name = resolveName(m[1], "start");
    const role = normalizeFounderRoleLabel(m[2]) || "Fondateur";
    if (!name) continue;
    pushCandidate(name, role, 5);
  }

  if (!candidates.length) return null;
  // Array.prototype.sort is stable, so equal scores keep discovery order.
  candidates.sort((a, b) => b.score - a.score);
  return candidates[0];
}

/**
 * Looks for an image on the page whose attributes mention the founder's name.
 *
 * The founder name is split into normalised tokens of at least 3 characters.
 * An image qualifies when its `alt`, `title`, `class` and `id` attributes
 * together contain at least two of those tokens (or the single token when there
 * is only one). Images flagged by isLikelyLogoUrl / isBadImageUrl are skipped.
 *
 * @param {Function} $ - Page query function used with a selector and with
 *   elements (presumably a loaded cheerio instance — confirm with the caller).
 * @param {string} baseUrl - Base URL used to resolve relative image sources.
 * @param {string|null} founderName - Founder name to look for.
 * @returns {string|null} URL of the first qualifying image, or null.
 */
function extractFounderPhotoFromPage($, baseUrl, founderName) {
  if (!founderName) return null;

  const nameTokens = normalizeBceForTokenMatch(founderName)
    .split(" ")
    .filter((token) => token.length >= 3);
  if (nameTokens.length === 0) return null;

  const requiredHits = Math.min(2, nameTokens.length);
  const HINT_ATTRIBUTES = ["alt", "title", "class", "id"];

  let photoUrl = null;
  $("img[src]").each((_, img) => {
    // Keep the first match only; later images are ignored.
    if (photoUrl) return;

    const $img = $(img);
    const candidateUrl = normalizePublicUrl($img.attr("src"), baseUrl);
    if (!candidateUrl) return;
    if (isLikelyLogoUrl(candidateUrl) || isBadImageUrl(candidateUrl)) return;

    const attributeText = HINT_ATTRIBUTES.map((attribute) => $img.attr(attribute) || "").join(" ");
    const hint = normalizeBceForTokenMatch(attributeText);
    if (!hint) return;

    let hits = 0;
    for (const token of nameTokens) {
      if (hint.includes(token)) hits += 1;
    }
    if (hits >= requiredHits) photoUrl = candidateUrl;
  });

  return photoUrl;
}

/**
 * Extracts a founder (name, role, photo) from JSON-LD nodes.
 *
 * Three passes, the first hit wins:
 *   1. explicit `founder` fields (string, object or array of objects);
 *   2. nodes typed `Person` that carry a recognised founder-like role;
 *   3. people listed under an organisation (`employee`, `member`, `creator`,
 *      `author`, `agent`, `alumni`) that carry a recognised role.
 *
 * @param {Array<object>|null|undefined} flatJsonlds - Flattened JSON-LD nodes.
 * @param {string} baseUrl - Base URL used to resolve relative image URLs.
 * @returns {{founder_name: string|null, founder_role: string|null, founder_photo_url: string|null}}
 *   The founder found, or an object with all three fields null.
 */
function extractFounderFromJsonLd(flatJsonlds, baseUrl) {
  const nodes = flatJsonlds || [];
  const toList = (value) => (Array.isArray(value) ? value : [value]);

  // Resolves the `image` field (string, array, or object with url/contentUrl).
  const resolvePhoto = (image) => {
    if (typeof image === "string") return normalizePublicUrl(image, baseUrl);
    if (Array.isArray(image)) return normalizePublicUrl(image[0], baseUrl);
    if (image && typeof image === "object") return normalizePublicUrl(image.url || image.contentUrl, baseUrl);
    return null;
  };

  // Turns a person node into a founder record. Without `force`, the person must
  // carry a recognised role; with `force`, the role defaults to "Fondateur".
  const pickPerson = (person, { force = false, companyNameHint = null } = {}) => {
    if (!person || typeof person !== "object") return null;

    const founder_name = normSpaces(typeof person.name === "string" ? person.name : "");
    if (!looksLikePersonName(founder_name, companyNameHint)) return null;

    const roleFields = [person.jobTitle, person.roleName, person.title, person.description];
    const roleRaw = roleFields.find((value) => typeof value === "string" && value) || "";
    const detectedRole = normalizeFounderRoleLabel(roleRaw);
    if (!detectedRole && !force) return null;

    return {
      founder_name,
      founder_role: detectedRole || "Fondateur",
      founder_photo_url: resolvePhoto(person.image) || null,
    };
  };

  // Pass 1: explicit founder fields.
  for (const node of nodes) {
    const founder = node?.founder;
    if (!founder) continue;

    if (typeof founder === "string") {
      const founder_name = normSpaces(founder);
      if (looksLikePersonName(founder_name)) {
        return { founder_name, founder_role: "Fondateur", founder_photo_url: null };
      }
      continue;
    }

    for (const person of toList(founder)) {
      const picked = pickPerson(person, { force: true });
      if (picked) return picked;
    }
  }

  // Pass 2: Person objects carrying a founder-like role.
  for (const node of nodes) {
    const isPerson = toList(node?.["@type"]).some((type) => normalizeBceForTokenMatch(type) === "person");
    if (!isPerson) continue;

    const picked = pickPerson(node, { force: false });
    if (picked) return picked;
  }

  // Pass 3: Organization-linked person arrays.
  for (const node of nodes) {
    const companyNameHint = normSpaces(node?.name || "");
    const pools = [node?.employee, node?.member, node?.creator, node?.author, node?.agent, node?.alumni];
    for (const pool of pools) {
      for (const person of toList(pool)) {
        const picked = pickPerson(person, { force: false, companyNameHint });
        if (picked) return picked;
      }
    }
  }

  return { founder_name: null, founder_role: null, founder_photo_url: null };
}

/**
 * Cleans a raw "contact person" value and returns it only if it looks like a
 * person's name.
 *
 * Strips a leading label ("contact", "responsable", "à l'attention de", ...),
 * a trailing phone/e-mail annotation, and `;` / `|` separators.
 *
 * @param {*} raw - Raw candidate text.
 * @param {object} [options]
 * @param {string|null} [options.companyNameHint=null] - Company name passed to
 *   looksLikePersonName to reject company-like values.
 * @returns {string|null} Cleaned name, or null when it is empty or not a
 *   plausible person name.
 */
function normalizeContactNameCandidate(raw, { companyNameHint = null } = {}) {
  let s = normSpaces(String(raw || ""));
  if (!s) return null;

  // The apostrophe class used to hold the mojibake bytes of U+2019 ("â€™"), so a
  // typographic apostrophe never matched. Accept the ASCII apostrophe, the real
  // U+2019, and the three-character mojibake sequence (for mis-decoded pages);
  // also accept the accented "à".
  s = s.replace(
    /^(contact|personne de contact|responsable|attn|[a\u00e0] l(?:['\u2019]|\u00e2\u20ac\u2122)attention de)\s*[:\-]?\s*/i,
    ""
  );
  s = s.replace(/\s*\((?:tel|telephone|phone|gsm|email).*\)\s*$/i, "");
  s = s.replace(/\s*[-|]\s*(?:tel|telephone|phone|gsm|email)\b.*$/i, "");
  s = normSpaces(s.replace(/[;|]+/g, " "));
  if (!s) return null;

  return looksLikePersonName(s, companyNameHint) ? s : null;
}

/**
 * Returns the first plausible contact person found in JSON-LD `contactPoint`
 * entries.
 *
 * A contact point may be a plain string or an object with a string `name`;
 * anything else is ignored.
 *
 * @param {Array<object>|null|undefined} flatJsonlds - Flattened JSON-LD nodes.
 * @param {object} [options]
 * @param {string|null} [options.companyNameHint=null] - Company name used to
 *   discard company-like values.
 * @returns {string|null} Contact name, or null when none qualifies.
 */
function extractContactNameFromJsonLd(flatJsonlds, { companyNameHint = null } = {}) {
  const readRawName = (point) => {
    if (typeof point === "string") return point;
    if (point && typeof point === "object" && typeof point.name === "string") return point.name;
    return null;
  };

  for (const node of flatJsonlds || []) {
    const contactPoint = node?.contactPoint;
    if (!contactPoint) continue;

    const points = Array.isArray(contactPoint) ? contactPoint : [contactPoint];
    for (const point of points) {
      const name = normalizeContactNameCandidate(readRawName(point), { companyNameHint });
      if (name) return name;
    }
  }
  return null;
}

/**
 * Tries to find a contact person's name inside an address block.
 *
 * First looks for an explicitly labelled value ("contact: ...",
 * "à l'attention de ..."). Failing that, it checks the first 10 chunks of the
 * text (split on newlines, `|`, `;` and `,`) and returns the first one that
 * looks like a person's name.
 *
 * @param {*} addressText - Raw address text.
 * @param {object} [options]
 * @param {string|null} [options.companyNameHint=null] - Company name used to
 *   discard company-like values.
 * @returns {string|null} Contact name, or null when none is found.
 */
function extractContactNameFromAddressText(addressText, { companyNameHint = null } = {}) {
  const src = String(addressText || "");
  if (!src) return null;

  // The apostrophe class used to hold the mojibake bytes of U+2019 ("â€™"), so a
  // typographic apostrophe never matched. Accept the ASCII apostrophe, the real
  // U+2019, and the three-character mojibake sequence; also accept "à".
  const labeledValueRe =
    /(?:contact|personne de contact|responsable|attn|[a\u00e0] l(?:['\u2019]|\u00e2\u20ac\u2122)attention de)\s*[:\-]?\s*([^\n|;,]+)/i;
  const leadingLabelRe =
    /^(contact|personne de contact|responsable|attn|[a\u00e0] l(?:['\u2019]|\u00e2\u20ac\u2122)attention de)\s*[:\-]?\s*/i;

  const labeled = src.match(labeledValueRe);
  if (labeled?.[1]) {
    const picked = normalizeContactNameCandidate(labeled[1], { companyNameHint });
    if (picked) return picked;
  }

  const chunks = src
    .split(/\r?\n|[|;,]/)
    .map((x) => normSpaces(x))
    .filter(Boolean)
    .slice(0, 10);

  for (const chunk of chunks) {
    const cleaned = chunk.replace(leadingLabelRe, "");
    const picked = normalizeContactNameCandidate(cleaned, { companyNameHint });
    if (picked) return picked;
  }

  return null;
}

/**
 * Detects which working languages are mentioned in a text.
 *
 * @param {*} text - Text to scan (normalised with normalizeBceForTokenMatch).
 * @returns {string[]} French labels of the languages mentioned, always in the
 *   order français, néerlandais, anglais, allemand; empty when none is found.
 */
function detectLanguagesFromText(text) {
  const normalized = normalizeBceForTokenMatch(text);
  if (!normalized) return [];

  const languageRules = [
    [/\b(francais|french)\b/, "français"],
    [/\b(neerlandais|nederlands|dutch)\b/, "néerlandais"],
    [/\b(anglais|english)\b/, "anglais"],
    [/\b(allemand|deutsch|german)\b/, "allemand"],
  ];

  return languageRules.filter(([pattern]) => pattern.test(normalized)).map(([, label]) => label);
}

/**
 * Derives a short availability label from a text.
 *
 * Rules are tested in order and the first match wins: round-the-clock service,
 * then by-appointment, then Monday-to-Friday.
 *
 * @param {*} text - Text to scan (normalised with normalizeBceForTokenMatch).
 * @returns {string|null} Availability label, or null when no rule matches.
 */
function detectAvailabilityFromText(text) {
  const normalized = normalizeBceForTokenMatch(text);
  if (!normalized) return null;

  const availabilityRules = [
    [/(24h|24 24|24 7|7j 7|jour et nuit)/, "24h/24 - 7j/7"],
    [/(sur rendez vous|uniquement sur rendez vous)/, "Sur rendez-vous"],
    [/(du lundi au vendredi|lundi vendredi)/, "Du lundi au vendredi"],
  ];

  for (const [pattern, label] of availabilityRules) {
    if (pattern.test(normalized)) return label;
  }
  return null;
}

/**
 * Joins the BCE status and juridical-situation labels into one display string.
 *
 * @param {*} statusLabel - Status label.
 * @param {*} juridicalSituationLabel - Juridical situation label.
 * @returns {string|null} Non-empty labels joined with " | ", or null when both
 *   are empty.
 */
function buildBceStatusLabel(statusLabel, juridicalSituationLabel) {
  const labels = [];
  for (const rawLabel of [statusLabel, juridicalSituationLabel]) {
    const label = normSpaces(rawLabel);
    if (label) labels.push(label);
  }
  return labels.length > 0 ? labels.join(" | ") : null;
}

/**
 * Decides whether a BCE status text allows the company to be published.
 *
 * @param {*} statusText - Status text (normalised with normalizeBceForTokenMatch).
 * @returns {boolean} True when the text contains at least one BCE_ACTIVE_TOKENS
 *   entry and none of the BCE_BLOCKING_TOKENS entries.
 */
function bceStatusIsPublishable(statusText) {
  const normalized = normalizeBceForTokenMatch(statusText);
  if (!normalized) return false;

  const mentionsAny = (tokens) => tokens.some((token) => normalized.includes(token));
  return mentionsAny(BCE_ACTIVE_TOKENS) && !mentionsAny(BCE_BLOCKING_TOKENS);
}

/**
 * Builds the lookup key for a company name: the normalised name with Belgian
 * legal-form abbreviations removed and whitespace collapsed.
 *
 * @param {*} s - Company or legal name.
 * @returns {string|null} Lookup key (may be an empty string when only legal
 *   forms were present), or null when the input normalises to nothing.
 */
function normalizeLegalNameForLookup(s) {
  const normalized = normalizeBceForTokenMatch(s);
  if (!normalized) return null;

  const withoutLegalForms = normalized.replace(/\b(srl|sprl|sa|asbl|scrl|snc|sc|sr|srlu)\b/g, " ");
  // Splitting on whitespace and re-joining collapses runs and trims both ends.
  return withoutLegalForms.split(/\s+/).filter(Boolean).join(" ");
}

/**
 * Builds the lookup key for a street line: the normalised text without the
 * street-type word (rue, avenue, ...) and without house numbers.
 *
 * @param {*} s - Street address line.
 * @returns {string|null} Lookup key (possibly empty), or null when the input
 *   normalises to nothing.
 */
function normalizeStreetForLookup(s) {
  const normalized = normalizeBceForTokenMatch(s);
  if (!normalized) return null;

  const withoutStreetType = normalized.replace(
    /\b(rue|avenue|av|boulevard|bd|chaussee|chemin|route|place|quai|allee|square|impasse)\b/g,
    " "
  );
  const withoutNumbers = withoutStreetType.replace(/\b\d+[a-z]?\b/g, " ");
  // Splitting on whitespace and re-joining collapses runs and trims both ends.
  return withoutNumbers.split(/\s+/).filter(Boolean).join(" ");
}

/**
 * Loads the local BCE CSV export found in BCE_DIR into in-memory lookup maps.
 *
 * `enterprise.csv` and `code.csv` are mandatory; `meta.csv`,
 * `denomination.csv`, `address.csv` and `contact.csv` are read when present
 * (contacts only when BCE_LOAD_CONTACT is set). Rows are addressed by column
 * position; the positions are assumed to follow the official BCE/KBO open-data
 * layout — confirm against the export's documentation if the format changes.
 *
 * @returns {Promise<object|null>} Dataset with `source`, `sourceUpdateDate`,
 *   `enterpriseByNumber`, `legalNameByNumber`, `legalNamesByNumber`,
 *   `numbersByLegalNameKey`, `addressByNumber` and `contactByNumber`, or null
 *   when a mandatory file is missing.
 */
async function loadLocalBceDataset() {
  const files = {
    meta: path.join(BCE_DIR, "meta.csv"),
    code: path.join(BCE_DIR, "code.csv"),
    enterprise: path.join(BCE_DIR, "enterprise.csv"),
    denomination: path.join(BCE_DIR, "denomination.csv"),
    address: path.join(BCE_DIR, "address.csv"),
    contact: path.join(BCE_DIR, "contact.csv"),
  };

  if (!fileExists(files.enterprise) || !fileExists(files.code)) {
    return null;
  }

  logInfo(`[BCE] loading local dataset from ${BCE_DIR} ...`);

  // (category, code) -> { LANGUAGE: description }, used to translate coded columns.
  const codeByCategoryAndCode = new Map();
  if (fileExists(files.code)) {
    await forEachCsvRow(files.code, (row) => {
      const category = normSpaces(row[0]);
      const code = normSpaces(row[1]);
      const language = normSpaces(row[2]);
      const description = normSpaces(row[3]);
      if (!category || !code || !language || !description) return;
      const key = makeBceCodeKey(category, code);
      const entry = codeByCategoryAndCode.get(key) || {};
      entry[language.toUpperCase()] = description;
      codeByCategoryAndCode.set(key, entry);
    });
  }

  // Snapshot date of the export, taken from the "SnapshotDate" row of meta.csv.
  let sourceUpdateDate = null;
  if (fileExists(files.meta)) {
    await forEachCsvRow(files.meta, (row) => {
      const variable = normSpaces(row[0]);
      const value = normSpaces(row[1]);
      if (variable === "SnapshotDate") {
        sourceUpdateDate = parseDdMmYyyyToIso(value);
      }
    });
  }

  const lookupCode = (category, code) => {
    const key = makeBceCodeKey(category, code);
    return pickCodeDescription(codeByCategoryAndCode.get(key), BCE_LANGUAGE);
  };

  // Enterprise number -> status / form information (labels fall back to raw codes).
  const enterpriseByNumber = new Map();
  await forEachCsvRow(files.enterprise, (row) => {
    const enterpriseNumber = normalizeBceDigits(row[0]);
    if (!enterpriseNumber) return;

    const statusCode = normSpaces(row[1]) || null;
    const juridicalSituationCode = normSpaces(row[2]) || null;
    const typeOfEnterpriseCode = normSpaces(row[3]) || null;
    const juridicalFormCode = normSpaces(row[4]) || null;

    const statusLabel = lookupCode("Status", statusCode) || statusCode;
    const juridicalSituationLabel = lookupCode("JuridicalSituation", juridicalSituationCode) || juridicalSituationCode;
    const typeOfEnterpriseLabel = lookupCode("TypeOfEnterprise", typeOfEnterpriseCode) || typeOfEnterpriseCode;
    const juridicalFormLabel = lookupCode("JuridicalForm", juridicalFormCode) || juridicalFormCode;

    enterpriseByNumber.set(enterpriseNumber, {
      statusCode,
      statusLabel,
      juridicalSituationCode,
      juridicalSituationLabel,
      typeOfEnterpriseCode,
      typeOfEnterpriseLabel,
      juridicalFormCode,
      juridicalFormLabel,
      startDate: parseDdMmYyyyToIso(normSpaces(row[6])) || null,
    });
  });

  const legalNameByNumber = new Map(); // best-ranked denomination per enterprise
  const legalNameRankByNumber = new Map(); // rank of the denomination currently retained
  const legalNamesByNumber = new Map(); // every distinct denomination per enterprise
  const numbersByLegalNameKey = new Map(); // normalised name key -> Set of enterprise numbers
  if (fileExists(files.denomination)) {
    await forEachCsvRow(files.denomination, (row) => {
      const entityNumber = normalizeBceDigits(row[0]);
      if (!entityNumber || !enterpriseByNumber.has(entityNumber)) return;
      const languageCode = normSpaces(row[1]);
      const typeOfDenomination = normSpaces(row[2]);
      const denomination = normSpaces(row[3]);
      if (!denomination) return;

      // Lower rank wins: denomination type dominates, language code breaks ties.
      const typeRank = typeOfDenomination === "001" ? 0 : typeOfDenomination === "002" ? 1 : typeOfDenomination === "003" ? 2 : 4;
      const langRank = languageCode === "1" ? 0 : languageCode === "2" ? 1 : languageCode === "0" ? 2 : 3;
      const rank = typeRank * 10 + langRank;

      const prevRank = legalNameRankByNumber.get(entityNumber);
      if (prevRank === undefined || rank < prevRank) {
        legalNameRankByNumber.set(entityNumber, rank);
        legalNameByNumber.set(entityNumber, denomination);
      }

      const list = legalNamesByNumber.get(entityNumber) || [];
      if (!list.includes(denomination)) list.push(denomination);
      legalNamesByNumber.set(entityNumber, list);

      const key = normalizeLegalNameForLookup(denomination);
      if (key) {
        const set = numbersByLegalNameKey.get(key) || new Set();
        set.add(entityNumber);
        numbersByLegalNameKey.set(key, set);
      }
    });
  }

  // Enterprise number -> address; a later row for the same number replaces the earlier one.
  const addressByNumber = new Map();
  if (fileExists(files.address)) {
    await forEachCsvRow(files.address, (row) => {
      const entityNumber = normalizeBceDigits(row[0]);
      if (!entityNumber || !enterpriseByNumber.has(entityNumber)) return;

      const postcode = normSpaces(row[4]) || null;
      const cityFr = normSpaces(row[6]) || null;
      const streetFr = normSpaces(row[8]) || null;
      const house = normSpaces(row[9]) || null;
      const box = normSpaces(row[10]) || null;

      const streetLine = [streetFr, house, box ? `boite ${box}` : null].filter(Boolean).join(" ").trim() || null;
      const city = sanitizeCityValue(cityFr);

      if (!postcode && !city && !streetLine) return;
      addressByNumber.set(entityNumber, {
        postal_code: postcode || null,
        city: city || null,
        address: streetLine || null,
      });
    });
  }

  // Enterprise number -> contact details; the first valid value per field is kept.
  const contactByNumber = new Map();
  if (BCE_LOAD_CONTACT && fileExists(files.contact)) {
    await forEachCsvRow(files.contact, (row) => {
      const entityNumber = normalizeBceDigits(row[0]);
      if (!entityNumber || !enterpriseByNumber.has(entityNumber)) return;

      const contactType = normSpaces(row[2]).toUpperCase();
      const rawValue = normSpaces(row[3]);
      if (!contactType || !rawValue) return;

      const entry = contactByNumber.get(entityNumber) || {
        website: null,
        public_phone: null,
        contact_email: null,
        facebook: null,
        instagram: null,
        linkedin: null,
        tiktok: null,
      };

      if (contactType === "WEB") {
        const social = classifySocialUrl(rawValue);
        if (social) {
          // A social URL only ever fills its platform slot. Previously a second
          // URL for an already-filled platform fell through and was stored as
          // the company website.
          if (!entry[social.platform]) entry[social.platform] = social.url;
        } else if (!entry.website) {
          entry.website = normalizePublicUrl(rawValue);
        }
      } else if (contactType === "EMAIL" && !entry.contact_email) {
        entry.contact_email = normalizeEmail(rawValue);
      } else if (contactType === "TEL" && !entry.public_phone) {
        entry.public_phone = normalizeBePhone(rawValue);
      }

      contactByNumber.set(entityNumber, entry);
    });
  }

  logInfo(
    `[BCE] loaded enterprise=${enterpriseByNumber.size} | legal_names=${legalNameByNumber.size} | contacts=${contactByNumber.size} | source_date=${
      sourceUpdateDate || "?"
    }`
  );

  return {
    source: "BCE_CSV",
    sourceUpdateDate: sourceUpdateDate || null,
    enterpriseByNumber,
    legalNameByNumber,
    legalNamesByNumber,
    numbersByLegalNameKey,
    addressByNumber,
    contactByNumber,
  };
}

/**
 * Assembles the enrichment record for one enterprise from the local BCE dataset.
 *
 * @param {string} bce10 - Enterprise number used as key in the dataset maps.
 * @param {object|null} dataset - Dataset returned by loadLocalBceDataset.
 * @returns {object|null} Flat record with BCE identity, status, legal names,
 *   contact and address fields (missing values are null), or null when the
 *   dataset or the enterprise is unavailable.
 */
function enrichFromLocalBce(bce10, dataset) {
  if (!dataset || !bce10) return null;

  const enterprise = dataset.enterpriseByNumber.get(bce10);
  if (!enterprise) return null;

  const orNull = (value) => value || null;

  const primaryLegalName = orNull(dataset.legalNameByNumber.get(bce10));
  const knownNames = dataset.legalNamesByNumber?.get(bce10);
  const allLegalNames = Array.isArray(knownNames) ? knownNames : [];
  const status = enterprise.statusLabel || enterprise.statusCode || null;
  const contacts = orNull(dataset.contactByNumber.get(bce10));
  const addr = orNull(dataset.addressByNumber?.get(bce10));

  const identity = {
    bce_number: formatBceForUi(bce10),
    bce_status: status,
    bce_legal_name: primaryLegalName,
    bce_legal_names: allLegalNames.length ? allLegalNames : null,
    founded_on: orNull(enterprise.startDate),
    bce_type_of_enterprise: orNull(enterprise.typeOfEnterpriseLabel),
    bce_juridical_form: orNull(enterprise.juridicalFormLabel),
    bce_juridical_situation: orNull(enterprise.juridicalSituationLabel),
    bce_source_update_date: orNull(dataset.sourceUpdateDate),
    bce_source: dataset.source,
  };

  const contactFields = {
    contact_email: orNull(contacts?.contact_email),
    public_phone: orNull(contacts?.public_phone),
    website: orNull(contacts?.website),
    facebook: orNull(contacts?.facebook),
    instagram: orNull(contacts?.instagram),
    linkedin: orNull(contacts?.linkedin),
    tiktok: orNull(contacts?.tiktok),
  };

  const addressFields = {
    address: orNull(addr?.address),
    postal_code: orNull(addr?.postal_code),
    city: orNull(addr?.city),
  };

  // Spread order keeps the same key order as a single literal would.
  return { ...identity, ...contactFields, ...addressFields };
}

/**
 * Tries to recover an enterprise number from a company's name and address using
 * the local BCE dataset.
 *
 * Candidates come from an exact match on the normalised legal-name key, or,
 * failing that, from keys that contain (or are contained in) the wanted key.
 * Each candidate is scored: exact name +60 / partial name +35, postal code +28,
 * city +22, street +20, and -40 when its status is not publishable. The best
 * candidate is returned only if it scores at least 55 and has an exact name,
 * postal code or street match.
 *
 * @param {object} params
 * @param {object|null} params.dataset - Dataset returned by loadLocalBceDataset.
 * @param {string} params.companyName - Company name to look up.
 * @param {string} [params.address] - Street address.
 * @param {string} [params.postal_code] - Postal code.
 * @param {string} [params.city] - City.
 * @returns {object|null} `{ bce10, score, info, exactNameMatch, partialNameMatch,
 *   postalMatch, cityMatch, streetMatch }` for the best candidate, or null.
 */
function recoverBceFromLocalIdentity({ dataset, companyName, address, postal_code, city }) {
  if (!dataset) return null;

  const nameKey = normalizeLegalNameForLookup(companyName);
  if (!nameKey) return null;

  const eitherContains = (a, b) => a.includes(b) || b.includes(a);

  const exactNumbers = dataset.numbersByLegalNameKey?.get(nameKey);
  const pool = exactNumbers ? Array.from(exactNumbers) : [];

  // fallback: fuzzy include on legal-name keys
  if (pool.length === 0) {
    for (const [key, numbers] of dataset.numbersByLegalNameKey || []) {
      if (!key || !eitherContains(key, nameKey)) continue;
      for (const number of numbers) pool.push(number);
      if (pool.length > 500) break;
    }
  }

  const candidates = [...new Set(pool)];
  if (candidates.length === 0) return null;

  const wantedPostal = normSpaces(postal_code);
  const wantedCity = sanitizeCityValue(city);
  const wantedStreet = normalizeStreetForLookup(address);

  // Scores one candidate; returns null when the dataset has no record for it.
  const evaluate = (bce10) => {
    const info = enrichFromLocalBce(bce10, dataset);
    if (!info) return null;

    const legalKey = normalizeLegalNameForLookup(info.bce_legal_name || "");
    const exactNameMatch = legalKey === nameKey;
    const partialNameMatch = !exactNameMatch && Boolean(legalKey) && eitherContains(legalKey, nameKey);

    const postalMatch = Boolean(wantedPostal && info.postal_code && wantedPostal === info.postal_code);
    const cityMatch = Boolean(wantedCity && info.city && sameCity(wantedCity, info.city));

    let streetMatch = false;
    if (wantedStreet && info.address) {
      const street = normalizeStreetForLookup(info.address);
      streetMatch = Boolean(street) && eitherContains(street, wantedStreet);
    }

    let score = 0;
    if (exactNameMatch) score += 60;
    else if (partialNameMatch) score += 35;
    if (postalMatch) score += 28;
    if (cityMatch) score += 22;
    if (streetMatch) score += 20;
    if (!bceStatusIsPublishable(info.bce_status || "")) score -= 40;

    return { bce10, score, info, exactNameMatch, partialNameMatch, postalMatch, cityMatch, streetMatch };
  };

  let best = null;
  for (const bce10 of candidates) {
    const scored = evaluate(bce10);
    if (!scored) continue;
    // Strict comparison: on equal scores the earlier candidate is kept.
    if (best === null || scored.score > best.score) best = scored;
  }

  if (best === null || best.score < 55) return null;
  // Guardrail: partial-name + city-only matches are too weak and create false positives.
  const hasStrongSignal = best.exactNameMatch || best.postalMatch || best.streetMatch;
  return hasStrongSignal ? best : null;
}

/**
 * Finds every string in a text that is shaped like a BCE enterprise number.
 *
 * Accepted shapes: an optional "BE" prefix, then 10 digits starting with 0 or 1,
 * grouped 4-3-3 with optional space, dot or dash separators
 * (BE 0xxx.xxx.xxx / BE 1xxx.xxx.xxx / 0xxx.xxx.xxx / 1xxx.xxx.xxx).
 *
 * @param {*} text - Text to scan.
 * @returns {string[]} Distinct values returned by normalizeBceDigits, in order
 *   of first appearance.
 */
function extractBceCandidates(text) {
  const bceShape = /(?:BE\s*)?([01]\d{3}[\s.\-]?\d{3}[\s.\-]?\d{3})/gi;
  const hits = String(text || "").match(bceShape) || [];

  const distinct = new Set();
  for (const hit of hits) {
    const digits = normalizeBceDigits(hit);
    if (digits) distinct.add(digits);
  }
  return [...distinct];
}

/**
 * Validates the check digits of a 10-digit BCE enterprise number.
 *
 * Mod97 rule: 97 - (first 8 digits % 97) must equal the last 2 digits.
 *
 * @param {*} bce10 - Candidate number; it is stringified and must be exactly
 *   10 digits.
 * @returns {boolean} True when the shape and the check digits are valid.
 */
function isValidBceModulo97(bce10) {
  const digits = String(bce10 || "");
  if (!/^\d{10}$/.test(digits)) return false;

  const base = Number.parseInt(digits.slice(0, 8), 10);
  const checkDigits = Number.parseInt(digits.slice(8), 10);
  return 97 - (base % 97) === checkDigits;
}

// -------------------- EMAIL --------------------
// Consumer mailbox providers. scoreEmail does not apply its off-domain penalty
// to these domains and gives them a small bonus when the local part looks like
// a person's name.
const PUBLIC_EMAIL_PROVIDERS = new Set([
  "gmail.com",
  "hotmail.com",
  "outlook.com",
  "live.com",
  "yahoo.com",
  "icloud.com",
  "skynet.be",
  "proximus.be",
  "telenet.be",
]);
// Public-administration domains. scoreEmail rejects an address whose domain is
// one of these or a subdomain of one of these.
const BLOCKED_EMAIL_DOMAINS = new Set([
  "spw.wallonie.be",
  "wallonie.be",
  "belgium.be",
  "fgov.be",
  "gov.be",
]);
// Recruitment / HR keywords appearing as a whole segment of the local part
// (delimited by start, end, or one of . _ + -). scoreEmail rejects matching
// addresses. The pattern has no `i` flag, so it expects a lowercased local part.
const BLOCKED_EMAIL_LOCALPART_RE =
  /(^|[._+\-])(job|jobs|career|careers|recruit(?:ment)?|recrut(?:ement)?|candidature|emploi|hiring|cv|rh|hr)([._+\-]|$)/;

/**
 * Normalises a raw e-mail value scraped from a page.
 *
 * Lowercases and trims the value, removes a `mailto:` prefix and any `?query`,
 * strips trailing punctuation and a trailing "tva"/"vat" token, validates the
 * overall shape, then removes an "email" / "e-mail" prefix glued to the local
 * part by the crawl (e.g. "emailinfo@x.be" becomes "info@x.be").
 *
 * @param {*} raw - Raw value (plain address, `mailto:` link, text snippet).
 * @returns {string|null} Normalised address, or null when the value is empty
 *   or not a well-formed address.
 */
function normalizeEmail(raw) {
  if (!raw) return null;

  let value = String(raw).trim().toLowerCase();
  if (!value) return null;

  const MAILTO_PREFIX = "mailto:";
  if (value.startsWith(MAILTO_PREFIX)) value = value.slice(MAILTO_PREFIX.length);

  value = value.split("?")[0] || value; // mailto params
  value = value
    .replace(/[)\]}>.,;:]+$/g, "") // common trailing punctuation in snippets
    .replace(/(?:\s|,|;|\/|\||-)*(tva|vat)\s*$/i, "")
    .trim();
  if (!value) return null;

  const wellFormed = /^[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}$/.test(value);
  if (!wellFormed) return null;

  const pieces = value.split("@");
  let localPart = pieces[0];
  const domain = pieces[1];
  if (!localPart || !domain) return null;

  // Common crawl artifact: "emailinfo@domain.tld" -> "info@domain.tld".
  const hasGluedEmailLabel = /^email(?:info|contact|devis|support|service|hello|admin)\b/.test(localPart);
  if (hasGluedEmailLabel) localPart = localPart.replace(/^email/, "");
  localPart = localPart.replace(/^e-?mail(?=[a-z0-9])/i, "");
  if (!localPart) return null;

  if (/\.be(?:tva|vat)$/i.test(domain)) return null;
  return `${localPart}@${domain}`;
}

/**
 * Finds every e-mail-shaped token in a text.
 *
 * @param {*} text - Text to scan.
 * @returns {string[]} Distinct lowercased addresses, in order of first
 *   appearance; empty when none is found.
 */
function extractEmailsFromText(text) {
  const hits = String(text || "").match(/[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}/gi);
  if (!hits) return [];

  const distinct = new Set();
  for (const hit of hits) distinct.add(hit.toLowerCase());
  return [...distinct];
}

/**
 * Collects the e-mail addresses found in `mailto:` links of a page.
 *
 * The part before `?` is URL-decoded when possible (kept as-is otherwise) and
 * split on `;` and `,` so that multi-recipient links yield several addresses.
 *
 * @param {Function} $ - Page query function used with a selector and with
 *   elements (presumably a loaded cheerio instance — confirm with the caller).
 * @returns {string[]} Distinct normalised addresses, in order of first
 *   appearance.
 */
function extractEmailsFromMailto($) {
  const MAILTO_PREFIX = "mailto:";

  // Best-effort decode: a malformed escape sequence leaves the value untouched.
  const decodeSafely = (value) => {
    try {
      return decodeURIComponent(value);
    } catch {
      return value;
    }
  };

  const distinct = new Set();
  $("a[href]").each((_, anchor) => {
    const href = String($(anchor).attr("href") || "").trim();
    if (!href.toLowerCase().startsWith(MAILTO_PREFIX)) return;

    const addressPart = href.slice(MAILTO_PREFIX.length).split("?")[0] || "";
    for (const piece of decodeSafely(addressPart).split(/[;,]/g)) {
      const email = normalizeEmail(piece);
      if (email) distinct.add(email);
    }
  });
  return [...distinct];
}

/**
 * Collects the e-mail addresses declared in the `email` field of JSON-LD nodes.
 *
 * The field may be a single string or an array of values; other types are
 * ignored.
 *
 * @param {Array<object>|null|undefined} jsonlds - JSON-LD nodes.
 * @returns {string[]} Distinct normalised addresses, in order of first
 *   appearance.
 */
function extractEmailsFromJsonLd(jsonlds) {
  const distinct = new Set();

  for (const node of jsonlds || []) {
    const field = node?.email;
    if (!field) continue;

    let rawValues;
    if (typeof field === "string") rawValues = [field];
    else if (Array.isArray(field)) rawValues = field;
    else continue;

    for (const rawValue of rawValues) {
      const email = normalizeEmail(rawValue);
      if (email) distinct.add(email);
    }
  }

  return [...distinct];
}

/**
 * Heuristic: does the local part of an address look like a person's name
 * rather than a role mailbox?
 *
 * @param {string} local - Part of the address before "@".
 * @returns {boolean} True for shapes such as prenom@, p.nom@, prenom.nom@, prenom-nom@.
 */
function isHumanLocalPart(local) {
  if (!local) return false;
  const candidate = local.toLowerCase();

  const hasOnlyAllowedChars = /^[a-z0-9._+\-]+$/.test(candidate);
  const isRoleMailbox = /^(contact|info|hello|support|webmaster|noreply|no-reply|admin)$/.test(candidate);
  if (!hasOnlyAllowedChars || isRoleMailbox) return false;

  const NAME_SHAPES = [
    /^[a-z]{2,}$/, // prenom@
    /^[a-z]\.[a-z]{2,}$/, // p.nom@
    /^[a-z]{2,}\.[a-z]{2,}$/, // prenom.nom@
    /^[a-z]{2,}[-_][a-z]{2,}$/, // prenom-nom@
  ];
  return NAME_SHAPES.some((shape) => shape.test(candidate));
}

/**
 * Scores how suitable an address is as the public contact of a business.
 *
 * @param {string} email - Raw address; it is normalized first.
 * @param {string} websiteUrl - Site the address was found for; its domain drives the match bonus.
 * @returns {number} Higher is better. `-1e9` means "never use": invalid address,
 *   blocked domain (or a subdomain of one), or blocked local part.
 */
function scoreEmail(email, websiteUrl) {
  const e = normalizeEmail(email);
  if (!e) return -1e9;

  const dom = domainOf(websiteUrl);
  const [local, domain] = e.split("@");

  let s = 0;
  if (BLOCKED_EMAIL_DOMAINS.has(domain)) return -1e9;
  if (Array.from(BLOCKED_EMAIL_DOMAINS).some((d) => domain.endsWith("." + d))) return -1e9;

  // Domain affinity: same domain > subdomain > unrelated private domain (penalized).
  if (dom && domain === dom) s += 50;
  if (dom && domain.endsWith("." + dom)) s += 40;
  if (dom && domain !== dom && !domain.endsWith("." + dom) && !PUBLIC_EMAIL_PROVIDERS.has(domain)) s -= 18;

  const localLower = local.toLowerCase();
  if (BLOCKED_EMAIL_LOCALPART_RE.test(localLower)) return -1e9;

  const human = isHumanLocalPart(localLower);
  if (human) s += 15;

  if (/(direction|gerance|administration|admin|owner|ceo|manager)/.test(localLower)) s += 20;
  if (/(contact|info|hello)/.test(localLower)) s += 10;
  if (/^(info|contact|devis|hello)$/.test(localLower)) s += 4;

  if (PUBLIC_EMAIL_PROVIDERS.has(domain) && human) s += 8;

  // Penalty for the "emailinfo@..." crawl artifact.
  // NOTE(review): the visible tail of normalizeEmail already strips this prefix,
  // so this branch looks unreachable; kept as a safety net.
  if (/^email(?:info|contact|devis|support|service|hello|admin)/.test(localLower)) s -= 12;

  // Third-party / technical mailboxes. "dev" must not match "devis" (French for
  // "quote"), which is rewarded above as a standard business mailbox.
  if (/(webmaster|support|noreply|no-reply|dev(?!is)|agency|studio|marketing)/.test(localLower)) s -= 20;

  return s;
}

/**
 * Chooses the single best contact address among scraped candidates.
 *
 * @param {Array<string>|null|undefined} emails - Raw candidate addresses.
 * @param {string} websiteUrl - Site the candidates belong to.
 * @returns {string|null} Highest-scoring address, or null when there is no
 *   candidate or the best score is below -5.
 */
function pickBestEmail(emails, websiteUrl) {
  const siteDomain = domainOf(websiteUrl);
  const pool = new Set((emails || []).map(normalizeEmail).filter(Boolean));

  if (siteDomain) {
    // Crawl artifact repair: "info@domain.bebce" / "contact@domain.bet" -> "...@domain.be".
    // Iterate over a snapshot because repaired variants are added to the pool.
    for (const address of [...pool]) {
      const atIndex = address.lastIndexOf("@");
      if (atIndex <= 0) continue;

      const mailbox = address.slice(0, atIndex);
      const host = address.slice(atIndex + 1);
      if (!host || host === siteDomain) continue;

      // Text glued right after the site domain (1-12 lowercase letters) is treated as noise.
      const gluedSuffix = host.startsWith(siteDomain) ? host.slice(siteDomain.length) : "";
      if (/^[a-z]{1,12}$/.test(gluedSuffix)) pool.add(`${mailbox}@${siteDomain}`);
    }
  }

  const candidates = [...pool];
  if (candidates.length === 0) return null;

  // Strict ">" keeps the earliest candidate on ties.
  const lead = candidates.reduce(
    (current, address) => {
      const score = scoreEmail(address, websiteUrl);
      return score > current.score ? { address, score } : current;
    },
    { address: null, score: -1e9 }
  );

  return lead.score >= -5 ? lead.address : null;
}

// -------------------- PHONE (PUBLIC) --------------------
// Goal: a single "public" phone, ideally normalized to +32... (E.164-ish).
/**
 * Normalizes a Belgian phone number to "+32" followed by 8 or 9 digits.
 *
 * Accepts `tel:` / `callto:` hrefs, label prefixes ("Tél:", "GSM"), "(0)"
 * trunk markers and trailing extensions.
 *
 * @param {*} raw - Raw phone text or href.
 * @returns {string|null} "+32..." or null when the input is not a usable Belgian number.
 */
function normalizeBePhone(raw) {
  if (!raw) return null;
  let text = String(raw).trim();
  if (!text) return null;

  // Drop the URI scheme, if any (matched case-insensitively).
  const lowered = text.toLowerCase();
  const scheme = ["tel:", "callto:"].find((prefix) => lowered.startsWith(prefix));
  if (scheme) text = text.slice(scheme.length);

  const [beforeQuery] = text.split("?"); // tel: params
  text = beforeQuery || text;
  text = text
    .replace(/\(0\)/g, "")
    .replace(/^\s*(téléphone|tél|tel|phone|gsm)\s*[:\-]?\s*/i, "")
    .replace(/\b(ext\.?|poste|extension)\b.*$/i, "")
    .trim();
  if (!text) return null;

  const digitsOnly = text.replace(/\D/g, "");
  if (!digitsOnly) return null;

  // 0032... => 32... (only when no explicit "+" is present).
  const digits = !text.includes("+") && digitsOnly.startsWith("0032") ? digitsOnly.slice(2) : digitsOnly;

  // International form: +32... / 32..., with an optional leftover trunk "0".
  if (digits.startsWith("32")) {
    const afterCountryCode = digits.slice(2);
    const subscriber = afterCountryCode.startsWith("0") ? afterCountryCode.slice(1) : afterCountryCode;
    return subscriber.length === 8 || subscriber.length === 9 ? `+32${subscriber}` : null;
  }

  // National form: 0XXXXXXXX or 0XXXXXXXXX.
  const isNational = digits.startsWith("0") && (digits.length === 9 || digits.length === 10);
  if (!isNational) return null;

  // Avoid confusing BCE numbers as phones.
  if (digits.length === 10 && isValidBceModulo97(digits)) return null;
  return `+32${digits.slice(1)}`;
}

/**
 * Finds Belgian-looking phone numbers in free text.
 *
 * @param {*} text - Anything; falsy values yield an empty list.
 * @returns {string[]} Normalized numbers, de-duplicated, in order of appearance.
 *   Scanning stops once 40 numbers (duplicates included) have been accepted.
 */
function extractPhonesFromText(text) {
  const haystack = String(text || "");
  if (!haystack) return [];

  const BE_PHONE_RE = /(?:\+32|0032|0)\s*(?:\(0\)\s*)?(?:\d[\s.\-()/]{0,3}){7,14}\d/g;
  const accepted = [];
  for (const [candidate] of haystack.matchAll(BE_PHONE_RE)) {
    const phone = normalizeBePhone(candidate);
    if (phone) accepted.push(phone);
    if (accepted.length >= 40) break;
  }
  return [...new Set(accepted)];
}

/**
 * Collects phone numbers from `tel:` and `callto:` anchors.
 *
 * @param {Function} $ - Cheerio-style selector function for the page.
 * @returns {string[]} Normalized numbers, de-duplicated, in document order.
 */
function extractPhonesFromTelLinks($) {
  const phones = new Set();

  $("a[href]").each((_, el) => {
    const href = String($(el).attr("href") || "").trim();
    const lowered = href.toLowerCase();
    const isPhoneLink = lowered.startsWith("tel:") || lowered.startsWith("callto:");
    if (!isPhoneLink) return;

    const phone = normalizeBePhone(href);
    if (phone) phones.add(phone);
  });

  return [...phones];
}

/**
 * Collects phone numbers from JSON-LD objects: the top-level `telephone`
 * plus `contactPoint.telephone` (single object or list).
 *
 * @param {Array<object>|null|undefined} jsonlds - Parsed JSON-LD objects.
 * @returns {string[]} Normalized numbers, de-duplicated, in discovery order.
 */
function extractPhonesFromJsonLd(jsonlds) {
  const phones = new Set();

  // Accepts a string, a (nested) list, or a `{ "@value": "..." }` wrapper.
  const collect = (value) => {
    if (!value) return;
    if (Array.isArray(value)) {
      value.forEach((item) => collect(item));
      return;
    }
    switch (typeof value) {
      case "string": {
        const phone = normalizeBePhone(value);
        if (phone) phones.add(phone);
        break;
      }
      case "object":
        if (typeof value["@value"] === "string") collect(value["@value"]);
        break;
      default:
        break;
    }
  };

  for (const node of jsonlds || []) {
    collect(node?.telephone);

    const contactPoint = node?.contactPoint;
    let points = [];
    if (Array.isArray(contactPoint)) {
      points = contactPoint;
    } else if (contactPoint && typeof contactPoint === "object") {
      points = [contactPoint];
    }
    for (const point of points) collect(point?.telephone);
  }

  return [...phones];
}

/**
 * Cleans a scraped city label down to a plausible, capitalized city name.
 *
 * The steps are order-dependent: cut at the first glued noise fragment, strip
 * separators / trailing labels / country / anything from the first digit on,
 * drop noise tokens, then capitalize.
 *
 * @param {*} raw - Scraped city text.
 * @param {{ fallback?: * }} [options] - `fallback` is used when `raw` is empty after normSpaces.
 * @returns {string|null} Cleaned city name, or null when nothing usable remains
 *   (including when normalizeCityName(result) is shorter than 3 characters).
 */
function sanitizeCityValue(raw, { fallback = null } = {}) {
  let s = normSpaces(raw);
  if (!s) s = normSpaces(fallback);
  if (!s) return null;

  // Remove glued footer/contact artifacts (e.g. "Liègecontactez-nousgsm").
  // Only cut when the fragment is not at position 0, so a value that starts
  // with a noise word is left for the later steps.
  {
    const NOISE_FRAGMENT_RE =
      /(contactez[-\s]?nous|contact|infos?|gsm|t[eé]l(?:[eé]phone)?|phone|email|e-?mail|siegesocial|si[eè]gesocial|tva|vat|mentions?|presentation|copyright)/i;
    const noiseHit = s.match(NOISE_FRAGMENT_RE);
    if (noiseHit && typeof noiseHit.index === "number" && noiseHit.index > 0) {
      s = s.slice(0, noiseHit.index).trim();
    }
  }

  // Separators become spaces; then everything from a label word / country word onward is dropped.
  s = s.replace(/[|,;/]+/g, " ").trim();
  s = s.replace(/\b(e-?mail|email|siege social|si[eè]ge social|si[eè]ge|social|tva|tvat|tel|telephone|phone|contact)\b.*$/i, "").trim();
  s = s.replace(/\b(belgique|belgium|be)\b.*$/i, "").trim();
  // JS \b (without the `u`/`v` flags) treats only [A-Za-z0-9_] as word characters,
  // so accented letters behave like separators in the patterns above.
  // This strips a trailing "be" that directly follows a character in U+00C0–U+024F,
  // keeping that character.
  // NOTE(review): an input like "Liègebe" has an ASCII "e" before "be", so it is
  // NOT altered by this line — confirm which inputs this was meant to repair.
  s = s.replace(/[\u00C0-\u024F]be$/i, (m) => m[0]).trim();
  // Drop everything from the first digit on (postal codes, house numbers, phone tails).
  s = s.replace(/[0-9]+.*$/g, "").trim();
  s = s.replace(/\s{2,}/g, " ").trim();
  if (!s) return null;

  const CITY_NOISE_TOKENS = new Set([
    "numero", "num", "no", "info", "contact", "belgiquecontact", "cedex", "province",
    // Trade/service keywords — never valid inside a city name
    "plombier", "plomberie", "debouchage", "chauffage", "chauffagiste", "sanitaire",
    "tarif", "tarifs", "urgence", "urgences", "service", "services",
    "intervention", "interventions", "devis", "entreprise", "societe",
    "reparation", "reparations", "electricien", "electricite", "toiture",
    "couvreur", "peintre", "peinture", "menuisier", "menuiserie",
    "maconnerie", "macon", "renovation", "nettoyage", "jardinage",
    "serrurier", "serrurerie", "vitrier", "vitrerie", "carreleur",
    "carrelage", "isolation", "demenagement", "ramonage",
    "competitif", "professionnel", "expert", "specialiste",
  ]);
  // Drop every word whose normalized token is empty or a known noise token.
  // (normalizeBceForTokenMatch is defined elsewhere; presumably lower-cases and
  // strips accents — verify.)
  let words = s.split(/\s+/).filter(Boolean);
  words = words.filter((w) => {
    const token = normalizeBceForTokenMatch(w).replace(/[^a-z0-9]/g, "");
    return token && !CITY_NOISE_TOKENS.has(token);
  });

  // Trim trailing leftovers: noise tokens or single-letter fragments, but
  // never remove the last remaining word.
  while (words.length > 1) {
    const last = normalizeBceForTokenMatch(words[words.length - 1]).replace(/[^a-z0-9]/g, "");
    if (CITY_NOISE_TOKENS.has(last) || /^[a-z]$/.test(last)) {
      words.pop();
      continue;
    }
    break;
  }
  if (!words.length) return null;

  // Capitalize words made only of letters, apostrophes and hyphens: first
  // character upper-cased, the rest lower-cased (so the parts after a hyphen
  // end up lower-case). Other words are kept as-is.
  words = words.map((w) => {
    if (/^[\p{L}'\-]+$/u.test(w)) {
      return w.slice(0, 1).toUpperCase() + w.slice(1).toLowerCase();
    }
    return w;
  });
  // Spaces around hyphens are removed ("Braine - le" -> "Braine-le").
  const out = words.join(" ").replace(/\s*-\s*/g, "-").trim();
  if (!out) return null;
  if (normalizeCityName(out).length < 3) return null;
  return out || null;
}

/**
 * Heuristic guard: does a candidate city label look like scraped noise rather
 * than a real city name?
 *
 * @param {*} raw - Candidate city label.
 * @returns {boolean} True when the label is empty, contains digits, has more
 *   than 4 words, an overlong word, a trailing single-letter word, or any
 *   contact / trade keyword.
 */
function isLikelyNoisyCityLabel(raw) {
  const s = normSpaces(raw);
  if (!s) return true;
  if (/\d/.test(s)) return true;

  const norm = normalizeBceForTokenMatch(s);
  if (!norm) return true;

  const words = norm.split(" ").filter(Boolean);
  if (!words.length) return true;
  if (words.length > 4) return true;
  if (words.some((w) => w.length > 24)) return true;
  if (/\b(?:numero|info|contact|cedex|belgiquecontact)\b/.test(norm)) return true;
  if (words.length > 1 && words[words.length - 1].length === 1) return true;

  // City labels should not contain trade/service keywords.
  // The patterns are whole-word matches, so stems must spell out their endings:
  // a bare "chauffag" between \b anchors can never match "chauffage" or
  // "chauffagiste". Plural forms mirror the noise tokens in sanitizeCityValue.
  if (
    /\b(?:plombier|debouchage|chauffag(?:e|iste)s?|sanitaire|tarifs?|urgences?|services?|interventions?|devis|contact|entreprise|societe|reparations?)\b/.test(
      norm
    )
  ) {
    return true;
  }

  return false;
}

/**
 * Cleans a scraped address string: removes markup, e-mails, phone and VAT/BCE
 * numbers, footer copy, field labels and a leading company name, then trims
 * the text to start at the first street keyword.
 *
 * The replacements are order-dependent (e.g. labelled phone patterns run
 * before the bare-digit patterns).
 *
 * @param {*} address - Raw address text (may contain HTML fragments).
 * @param {{ companyName?: string|null }} [options] - When given, a leading
 *   occurrence of the company name is stripped.
 * @returns {string|null} Cleaned address, or null when the input is empty,
 *   nothing remains, or the text looks like cookie/privacy/site-builder boilerplate.
 */
function sanitizeAddressText(address, { companyName = null } = {}) {
  if (!address) return null;
  let s = normSpaces(String(address));
  if (!s) return null;

  // Strip anything that looks like an HTML tag.
  s = s.replace(/<[^>]+>/g, " ");

  // Remove email and common phone segments that sometimes get stuck in <address> blocks.
  s = s.replace(/[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}/gi, " ");
  // Labelled phone ("Tél: 04 ...") including its label.
  s = s.replace(/\b(?:t[eé]l[eé]phone|t[eé]l|tel|phone|gsm)\b\s*[:\-]?\s*(?:\+32|0032|0)[\d\s().\/-]{6,}\d/gi, " ");
  // Unlabelled international phone (+32 / 0032).
  s = s.replace(/(?:\+32|0032)\s*(?:\(0\)\s*)?(?:\d[\s.\-()/]{0,3}){7,14}\d/g, " ");
  // VAT / BCE enterprise numbers, with or without a "TVA"/"VAT" label.
  s = s.replace(/\b(?:tva|vat)\s*[:#\-]?\s*(?:be\s*)?0?\d{4}[.\s]?\d{3}[.\s]?\d{3}\b/gi, " ");
  s = s.replace(/\bbe\s*0?\d{4}[.\s]?\d{3}[.\s]?\d{3}\b/gi, " ");
  // Bare national phone-like digit groups (leading 0, then 2+2/3+2+2 style groups).
  s = s.replace(/\b0\d{1,2}\s?\d{2,3}\s?\d{2}\s?\d{2}\b/g, " ");
  s = s.replace(/\b\d{2}\s?\d{3}\s?\d{2}\s?\d{2}\b/g, " ");
  // Footer/navigation copy: drop everything from the keyword to the end.
  s = s.replace(/\b(?:presentation|présentation|plan du site|page d['’]accueil|copyright|tous droits r[ée]serv[ée]s?|mentions l[ée]gales?).*$/i, " ");

  // Strip common labels/noise from contact pages and footers.
  s = s.replace(
    /\b(adresse|nos bureaux|si[eè]ge(?: social| d'exploitation)?|sur rendez-vous|sur rdv|bureau)\b\s*[:\-]?\s*/gi,
    " "
  );
  // Trailing country mention.
  s = s.replace(/\b(?:belgique|belgium)\b\s*$/i, "");
  s = s.replace(/\(\s*(?:belgique|be)\s*\)\s*$/i, "");

  // Reject blocks that are boilerplate rather than an address.
  // (normalizeBceForTokenMatch is defined elsewhere; the unaccented keywords
  // below suggest it lower-cases and strips accents — verify.)
  const lower = normalizeBceForTokenMatch(s);
  if (
    /(cookies|optimise par|optimisee par|webnode|remplir les champs|traitement des donnees personnelles|privacy|mentions legales)/.test(
      lower
    )
  ) {
    return null;
  }

  // Drop a leading company name. Detection uses the normalized forms, but the
  // removal regex is built from the raw companyName, so it only strips when the
  // raw spelling also matches (case-insensitively) at the start.
  if (companyName) {
    const c = normalizeBceForTokenMatch(companyName);
    const sNorm = normalizeBceForTokenMatch(s);
    if (c && sNorm.startsWith(c)) {
      s = s.replace(new RegExp(`^${escapeRegex(companyName)}\\s*[,:;\\-|/]*\\s*`, "i"), "");
    }
  }

  // If extra copy exists before the actual street, keep the street segment.
  // Only applies when the street keyword starts within the first 160 characters.
  const streetMatch = s.match(/\b(rue|avenue|av\.|boulevard|bd|chaussée|chaussee|chemin|place|quai|route|impasse|allée|allee|square)\b/i);
  if (streetMatch && typeof streetMatch.index === "number" && streetMatch.index > 0 && streetMatch.index < 160) {
    s = s.slice(streetMatch.index);
  }

  // Normalize separators to ", " and trim leftover punctuation at both ends.
  s = s.replace(/\s*,\s*,+/g, ", ");
  s = normSpaces(s.replace(/[,;|]+/g, ", "));
  s = s.replace(/^[,;:\s]+|[,;:\s]+$/g, "");
  return s || null;
}

/**
 * Removes the postal code and city from an address so only the street part remains.
 *
 * @param {*} address - Full address text.
 * @param {*} postal_code - Postal code to strip (optional).
 * @param {*} city - City name to strip (optional).
 * @returns {string|null} Street part; the normalized original when stripping
 *   would leave nothing; null when the address itself is empty.
 */
function stripPostalCityFromAddress(address, postal_code, city) {
  const original = normSpaces(String(address || ""));
  if (!original) return null;

  const postal = postal_code ? String(postal_code).trim() : "";
  const town = city ? normSpaces(String(city)) : "";

  // Applied in order: "<postal> <city>" pairs, remaining postal codes, then a trailing city.
  const removals = [];
  if (postal && town) {
    removals.push(new RegExp(`\\b${escapeRegex(postal)}\\s+${escapeRegex(town)}\\b`, "gi"));
  }
  if (postal) {
    removals.push(new RegExp(`\\b${escapeRegex(postal)}\\b`, "g"));
  }
  if (town) {
    removals.push(new RegExp(`[\\s,;|\\-]+${escapeRegex(town)}\\s*$`, "i"));
  }
  const stripped = removals.reduce((text, pattern) => text.replace(pattern, " "), original);

  const tidy = normSpaces(stripped.replace(/[,;|]+/g, ", ")).replace(/^[,;:\s]+|[,;:\s]+$/g, "");

  // If stripping removed everything, keep the original (better than NULL for UI).
  return tidy || original;
}

// -------------------- IMAGES --------------------
/**
 * Guesses from the URL alone whether an image is a logo/icon asset.
 *
 * @param {string|null|undefined} u - Image URL.
 * @returns {boolean} True when the URL mentions "logo"/"favicon" or contains
 *   "icon"/"icons" as a standalone token.
 */
function isLikelyLogoUrl(u) {
  const lowered = (u || "").toLowerCase();

  const namesLogoAsset = ["logo", "favicon"].some((needle) => lowered.includes(needle));
  if (namesLogoAsset) return true;

  // "icon"/"icons" must be delimited by non-word characters, so words that merely
  // contain it (e.g. "silicone.jpg") are not flagged. Hyphen-delimited names such
  // as "menu-icons.png" ARE flagged. Any URL containing "iconic" is exempted.
  return /\bicon[s]?\b/.test(lowered) && !lowered.includes("iconic");
}

/**
 * Rejects URLs that cannot be a usable photo: empty, inline data URIs,
 * SVG/ICO files and sprite sheets.
 *
 * @param {string|null|undefined} u - Image URL.
 * @returns {boolean} True when the URL must be skipped.
 */
function isBadImageUrl(u) {
  const s = (u || "").toLowerCase();
  if (!s) return true;
  if (s.includes("data:image")) return true;
  // Match the extension even when a query string or fragment follows it
  // ("photo.svg?v=2"), which a plain endsWith() check misses.
  if (/\.(?:svg|ico)(?:[?#]|$)/.test(s)) return true;
  if (s.includes("sprite")) return true;
  return false;
}

/**
 * Rejects URLs that cannot be a usable logo: empty, inline data URIs, ICO
 * files, sprite sheets and favicons. SVG is allowed here, unlike for photos.
 *
 * @param {string|null|undefined} u - Logo URL.
 * @returns {boolean} True when the URL must be skipped.
 */
function isBadLogoUrl(u) {
  const s = (u || "").toLowerCase();
  if (!s) return true;
  if (s.includes("data:image")) return true;
  // Match ".ico" even when a query string or fragment follows it ("app.ico?v=3").
  if (/\.ico(?:[?#]|$)/.test(s)) return true;
  if (s.includes("sprite")) return true;
  if (s.includes("favicon")) return true;
  return false;
}

/**
 * Builds a comparison key so that variants of the same image collapse together:
 * scheme, "www.", query string, letter case and a "-WxH" / "_WxH" size suffix
 * before the extension are all ignored.
 *
 * @param {*} raw - Image URL.
 * @returns {string|null} `host + path` key; the lower-cased URL when it cannot
 *   be parsed; null when safeUrl rejects the input.
 */
function canonicalImageUrlKey(raw) {
  const safe = safeUrl(raw);
  if (!safe) return null;

  let parsed;
  try {
    parsed = new URL(safe);
  } catch {
    return String(safe).toLowerCase();
  }

  const host = parsed.hostname.toLowerCase().replace(/^www\./, "");
  const normalizedPath = String(parsed.pathname || "/")
    .replace(/\/+/g, "/")
    .replace(/\/$/g, "")
    .toLowerCase();

  // Thumbnail size suffixes such as "photo-300x200.jpg".
  const SIZE_SUFFIXES = [/-\d{2,5}x\d{2,5}(?=\.[a-z0-9]+$)/i, /_\d{2,5}x\d{2,5}(?=\.[a-z0-9]+$)/i];
  const unsizedPath = SIZE_SUFFIXES.reduce((p, suffix) => p.replace(suffix, ""), normalizedPath);

  return `${host}${unsizedPath}`;
}

/**
 * Removes duplicate images by canonical key, keeping the first occurrence.
 *
 * @param {Array<*>|null|undefined} urls - Candidate URLs, in priority order.
 * @param {{ exclude?: Array<*> }} [options] - URLs whose canonical key must never be returned.
 * @returns {string[]} Sanitized URLs in input order, duplicates and excluded entries removed.
 */
function dedupeImageUrls(urls, { exclude = [] } = {}) {
  // One set holds both the excluded keys and the keys already emitted.
  const takenKeys = new Set();
  for (const excluded of exclude || []) {
    const excludedKey = canonicalImageUrlKey(excluded);
    if (excludedKey) takenKeys.add(excludedKey);
  }

  const kept = [];
  for (const raw of urls || []) {
    const safe = safeUrl(raw);
    if (!safe) continue;

    const key = canonicalImageUrlKey(safe) || safe;
    if (takenKeys.has(key)) continue;

    takenKeys.add(key);
    kept.push(safe);
  }
  return kept;
}

/**
 * Scores an image from textual hints only (URL, alt/title, class/id, context).
 *
 * Each rule contributes at most once, no matter how many of its keywords match.
 *
 * @param {{ src?: string, alt?: string, title?: string, cls?: string, id?: string,
 *   pageUrl?: string, contextHint?: string }} hints - Text attached to the image.
 * @returns {number} Positive for human/work/proof-like visuals, negative for
 *   stock, placeholder, logo/decorative or layout assets.
 */
function imageSemanticScore({ src, alt, title, cls, id, pageUrl, contextHint }) {
  const assetHint = normalizeBceForTokenMatch(
    [src, alt, title, cls, id, contextHint].map((part) => part || "").join(" ")
  );
  const pageHint = normalizeBceForTokenMatch(pageUrl || "");

  const ASSET_RULES = [
    // Positive signals: human/team/work/proof-like visuals.
    [/(equipe|team|portrait|fondateur|founder|artisan|technicien|personnel|staff|nous)/, 12],
    [/(chantier|intervention|atelier|realisation|avant apres|projet|portfolio|galerie)/, 10],
    [/(client|temoignage|avis|sourire|uniforme|tenue|camion|vehicule)/, 8],
    [/(hero|banner|cover|showcase)/, 4],
    // Negative signals: decorative / stock-like / utility assets.
    [/(stock|shutterstock|getty|istock|adobe stock|pixabay|unsplash|freepik)/, -16],
    [/(placeholder|default|dummy|coming soon|no image|avatar)/, -12],
    [/(logo|favicon|icon|sprite|pattern|background|bg )/, -22],
    [/(header|navbar|topbar|menu|footer|breadcrumb)/, -8],
  ];

  let total = 0;
  for (const [pattern, weight] of ASSET_RULES) {
    if (pattern.test(assetHint)) total += weight;
  }

  // Images found on legal/privacy pages are slightly less likely to be showcase photos.
  if (/(mentions|legal|privacy|cookies)/.test(pageHint)) total -= 4;

  return total;
}

/**
 * Picks the largest candidate of a `srcset` attribute.
 *
 * Both descriptor kinds are honoured: width ("800w") and pixel density ("2x",
 * "1.5x"). A valid srcset does not mix the two, so candidates are ranked by
 * the raw descriptor value. Entries without a descriptor rank lowest, and
 * ties keep the earliest entry.
 *
 * NOTE(review): the list is split on every comma, so a URL that itself
 * contains a comma (some image CDNs) is cut in two.
 *
 * @param {*} srcset - Raw attribute value.
 * @returns {string|null} URL of the best candidate, or null when none is found.
 */
function pickFromSrcset(srcset) {
  const raw = String(srcset || "").trim();
  if (!raw) return null;

  const entries = [];
  for (const chunk of raw.split(",")) {
    const m = chunk.trim().match(/^(\S+)(?:\s+(\d+(?:\.\d+)?)[wx])?/);
    if (!m) continue;
    entries.push({ url: m[1], rank: m[2] ? Number(m[2]) : 0 });
  }
  if (!entries.length) return null;

  // Array.prototype.sort is stable, so equal ranks keep document order.
  entries.sort((a, b) => b.rank - a.rank);
  return entries[0].url || null;
}

/**
 * Regex-based scan of raw HTML for image URLs, without building a DOM.
 *
 * Looks at, in this order: `src`-like attributes, `srcset` attributes (best
 * candidate only) and CSS `url(...)` references to raster images.
 *
 * @param {*} html - Raw HTML source.
 * @param {string} pageUrl - Base URL used to resolve relative references.
 * @returns {string[]} Up to 80 absolute, de-duplicated image URLs.
 */
function extractImageCandidateUrlsFromHtmlRaw(html, pageUrl) {
  const markup = String(html || "");
  if (!markup) return [];

  const collected = [];
  const collect = (candidate) => {
    const absolute = absolutize(candidate, pageUrl);
    if (absolute && !isBadImageUrl(absolute)) collected.push(absolute);
  };

  const ATTR_URL_RE = /\b(?:src|data-src|data-lazy-src|data-original|poster)\s*=\s*["']([^"'<>]+)["']/gi;
  const SRCSET_RE = /\b(?:srcset|data-srcset)\s*=\s*["']([^"']+)["']/gi;
  // The capture stops at the extension, so any query string after it is dropped.
  const CSS_URL_RE = /url\(\s*['"]?([^'")<>]+\.(?:avif|webp|png|jpe?g|gif))(?:\?[^'")<>]*)?['"]?\s*\)/gi;

  for (const [, value] of markup.matchAll(ATTR_URL_RE)) {
    collect(value);
  }
  for (const [, value] of markup.matchAll(SRCSET_RE)) {
    const best = pickFromSrcset(value);
    if (best) collect(best);
  }
  for (const [, value] of markup.matchAll(CSS_URL_RE)) {
    collect(value);
  }

  return dedupeImageUrls(collected).slice(0, 80);
}

/**
 * Lists logo candidates found on a page, each with a heuristic bonus.
 *
 * Sources, in order: `og:logo` / `itemprop=logo` meta tags, apple-touch-icon
 * links, then `<img>` elements that look like a logo.
 *
 * @param {Function} $ - Cheerio-style selector function for the page.
 * @param {string} pageUrl - Base URL used to resolve relative references.
 * @returns {Array<{ url: string, bonus: number }>} Candidates in discovery
 *   order; the same URL may appear more than once.
 */
function extractLogoCandidatesFromPage($, pageUrl) {
  const candidates = [];
  const offer = (rawUrl, bonus = 0) => {
    const resolved = absolutize(rawUrl, pageUrl);
    if (resolved && !isBadLogoUrl(resolved)) candidates.push({ url: resolved, bonus });
  };

  // Explicit logo metadata.
  const ogLogo = $('meta[property="og:logo"]').attr("content");
  const itempropLogo = $('meta[itemprop="logo"]').attr("content");
  if (ogLogo) offer(ogLogo, 35);
  if (itempropLogo) offer(itempropLogo, 28);

  // apple-touch-icon: high-quality logo (≥180px), ideal candidate
  $('link[rel="apple-touch-icon"], link[rel="apple-touch-icon-precomposed"]').each((_, el) => {
    const href = $(el).attr("href");
    if (href) offer(href, 38);
  });

  $("img").each((_, el) => {
    const $img = $(el);
    const readAttr = (name) => String($img.attr(name) || "");

    const src =
      $img.attr("src") ||
      $img.attr("data-src") ||
      $img.attr("data-lazy-src") ||
      pickFromSrcset($img.attr("srcset"));
    if (!src) return;

    const hint = `${src} ${readAttr("alt")} ${readAttr("title")} ${readAttr("class")} ${readAttr("id")}`.toLowerCase();

    let bonus = 0;
    // Naming signals.
    if (/(logo|site-logo|custom-logo|brand|navbar-brand|header-logo)/i.test(hint)) bonus += 28;
    // Placement signals: inside the header/navigation, or wrapped in a link to the home page.
    if ($img.closest("header, nav, .site-header, .main-header, .navbar, .topbar").length) bonus += 16;
    if ($img.closest("a[href='/'], a[href='./'], a[href=''], a[href*='home']").length) bonus += 10;

    // Declared dimensions within a plausible logo range.
    const width = Number($img.attr("width") || 0);
    const height = Number($img.attr("height") || 0);
    const hasPlausibleSize = width && height && width >= 70 && height >= 20 && width <= 1600 && height <= 700;
    if (hasPlausibleSize) bonus += 6;

    if (/favicon|icon/.test(hint)) bonus -= 25;

    // Keep images with enough evidence, or any image explicitly named logo/brand.
    if (bonus >= 8 || /logo|brand/.test(hint)) offer(src, bonus);
  });

  return candidates;
}

/**
 * Resolves a possibly-relative URL against a base.
 *
 * @param {*} url - URL or relative reference.
 * @param {string} [base] - Base URL for relative references.
 * @returns {string|null} Absolute URL, or null when `url` is null/undefined or
 *   cannot be resolved.
 */
function absolutize(url, base) {
  // `new URL(undefined, base)` stringifies its argument and would yield
  // "<base>/undefined"; a missing reference must not produce a bogus URL.
  if (url == null) return null;
  try {
    return new URL(url, base).toString();
  } catch {
    return null;
  }
}

/**
 * Reads the dimensions and type of a remote image through `probe`, fed with
 * the streamed response body.
 *
 * @param {string} url - Image URL.
 * @returns {Promise<{ width: number, height: number, type: string }|null>}
 *   Probe result, or null on any failure (network error, non-OK status,
 *   missing body, unrecognized image).
 */
async function probeRemoteImage(url) {
  let nodeStream = null;
  try {
    const res = await fetchWithTimeout(url, {
      redirect: "follow",
      headers: {
        "User-Agent": USER_AGENT,
        Accept: "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
      },
    });
    if (!res.ok || !res.body) return null;
    nodeStream = Readable.fromWeb(res.body);
    const info = await probe(nodeStream);
    return { width: info.width, height: info.height, type: info.type };
  } catch {
    // Best-effort: an unprobeable image is reported as "unknown".
    return null;
  } finally {
    // Always release the response stream, including when probe() rejects;
    // otherwise the underlying body stays open until it is garbage-collected.
    nodeStream?.destroy();
  }
}

/**
 * Ranks logo candidates and returns the most plausible one.
 *
 * Bonuses of duplicate URLs are summed; the 24 best candidates are probed
 * (6 at a time) and scored on bonus, dimensions, aspect ratio and URL naming.
 *
 * @param {Array<{ url: string, bonus?: number }>|null|undefined} logoCandidates
 * @param {string|null|undefined} fallbackLogoUrl - Also entered as a candidate with bonus 20.
 * @returns {Promise<string|null>} Best logo URL, or the fallback / null when there is no candidate.
 */
async function pickBestLogoUrl(logoCandidates, fallbackLogoUrl) {
  const bonusTotals = new Map();
  const register = (candidateUrl, weight = 0) => {
    const safe = safeUrl(candidateUrl);
    if (!safe || isBadLogoUrl(safe)) return;
    bonusTotals.set(safe, (bonusTotals.get(safe) || 0) + Number(weight || 0));
  };

  for (const candidate of logoCandidates || []) register(candidate?.url, candidate?.bonus || 0);
  if (fallbackLogoUrl) register(fallbackLogoUrl, 20);

  const shortlist = [...bonusTotals.entries()].sort((a, b) => b[1] - a[1]).slice(0, 24);
  if (shortlist.length === 0) return fallbackLogoUrl || null;

  const evaluate = async ([url, bonus]) => {
    const loweredUrl = url.toLowerCase();
    const namedLikeLogo = /logo|brand/.test(loweredUrl);
    const isAppleTouchIcon = /apple-touch-icon/.test(loweredUrl);
    const isSvg = /\.svg(\?|$)/i.test(url);

    const info = isSvg ? null : await probeRemoteImage(url);

    // SVG logos: can't probe dimensions, but are often the real header logo.
    // Give them (and any unprobeable image) a score based on bonus + URL signals.
    if (!info) {
      let score = bonus * 100_000;
      if (namedLikeLogo) score += 60_000;
      if (isSvg) score += 80_000; // SVGs are typically the real vector logo
      if (isAppleTouchIcon) score += 40_000;
      return { url, bonus, score };
    }

    const width = Number(info.width || 0);
    const height = Number(info.height || 0);
    const area = width * height;
    const ratio = height ? width / height : 0;

    let score = bonus * 100_000 + Math.min(area, 1_600_000);
    if (ratio >= 1.1 && ratio <= 8) score += 120_000; // landscape, logo-like ratio
    if (width < 70 || height < 20 || area < 8_000) score -= 350_000; // tiny
    if (ratio < 0.95 && area < 160_000) score -= 160_000; // small portrait: icon-ish
    if (namedLikeLogo) score += 60_000;
    if (isAppleTouchIcon) score += 40_000;

    return { url, bonus, score };
  };

  const limit = pLimit(6);
  const scored = await Promise.all(shortlist.map((entry) => limit(() => evaluate(entry))));

  scored.sort((a, b) => b.score - a.score);
  return scored[0]?.url || shortlist[0][0] || fallbackLogoUrl || null;
}

/**
 * Detects logos that are (almost) entirely white, which would be invisible on
 * a white page.
 *
 * @param {Buffer} buffer - Encoded image.
 * @returns {Promise<boolean>} True when fewer than 20 pixels are visible, or
 *   when at least 85% of visible pixels are near-white and at most 5% are dark.
 *   False when the image cannot be decoded.
 */
async function isLikelyWhiteLogoBuffer(buffer) {
  try {
    const { data, info } = await sharp(buffer).ensureAlpha().raw().toBuffer({ resolveWithObject: true });
    const stride = Number(info?.channels || 4);
    if (!data?.length || stride < 3) return false;

    const tally = { visible: 0, nearWhite: 0, dark: 0 };

    for (let offset = 0; offset + 2 < data.length; offset += stride) {
      const alpha = stride >= 4 ? data[offset + 3] : 255;
      if (alpha < 16) continue; // (almost) fully transparent pixel
      tally.visible += 1;

      const [red, green, blue] = [data[offset], data[offset + 1], data[offset + 2]];
      const spread = Math.max(red, green, blue) - Math.min(red, green, blue);
      const luminance = 0.2126 * red + 0.7152 * green + 0.0722 * blue;

      // Threshold 228 catches off-white colors (#E0E0E0 style) in addition to pure white.
      if (luminance >= 228 && spread <= 30) tally.nearWhite += 1;
      if (luminance <= 120) tally.dark += 1;
    }

    // Only reject completely empty/unparseable images (< 20 visible pixels), not sparse logos.
    if (tally.visible < 20) return true;

    const mostlyWhite = tally.nearWhite / tally.visible >= 0.85;
    const hardlyAnyDark = tally.dark / tally.visible <= 0.05;
    return mostlyWhite && hardlyAnyDark;
  } catch {
    return false;
  }
}

/**
 * Detects logos with a dark background (e.g. white text on black/dark bg).
 * These render as a dark rectangle on a white page — not ideal.
 *
 * @param {Buffer} buffer - Encoded image.
 * @returns {Promise<boolean>} True when at least 20 pixels are visible and
 *   45%+ of them are dark (luminance ≤ 80). False when the image cannot be decoded.
 */
async function isLikelyDarkBackgroundLogo(buffer) {
  try {
    const { data, info } = await sharp(buffer).ensureAlpha().raw().toBuffer({ resolveWithObject: true });
    const stride = Number(info?.channels || 4);
    if (!data?.length || stride < 3) return false;

    let visibleCount = 0;
    let darkCount = 0;
    for (let offset = 0; offset + 2 < data.length; offset += stride) {
      const alpha = stride >= 4 ? data[offset + 3] : 255;
      if (alpha < 16) continue; // ignore (almost) transparent pixels

      visibleCount += 1;
      const luminance = 0.2126 * data[offset] + 0.7152 * data[offset + 1] + 0.0722 * data[offset + 2];
      if (luminance <= 80) darkCount += 1;
    }

    // Too few visible pixels to judge.
    if (visibleCount < 20) return false;
    return darkCount / visibleCount >= 0.45;
  } catch {
    return false;
  }
}

/**
 * Detects logos/graphics with a transparent background.
 * Photos are not expected to carry significant transparency, so this is used
 * to reject logo-graphics from the cover/gallery slots.
 *
 * @param {Buffer} buffer - Encoded image.
 * @returns {Promise<boolean>} True when the image has an alpha channel and at
 *   least 25% of its pixels are (almost) fully transparent. False when the
 *   image cannot be decoded.
 */
async function isLikelyTransparentGraphic(buffer) {
  try {
    // Cheap exit: no alpha channel means nothing can be transparent.
    const { hasAlpha } = await sharp(buffer).metadata();
    if (!hasAlpha) return false;

    const { data, info } = await sharp(buffer).ensureAlpha().raw().toBuffer({ resolveWithObject: true });
    const stride = Number(info?.channels || 4);

    let pixelCount = 0;
    let clearCount = 0;
    for (let offset = 0; offset + 2 < data.length; offset += stride) {
      pixelCount += 1;
      if (data[offset + 3] < 16) clearCount += 1;
    }

    if (pixelCount === 0) return false;
    return clearCount / pixelCount >= 0.25;
  } catch {
    return false;
  }
}

/**
 * Downloads an image and re-encodes it as WebP (quality 78).
 *
 * Up to 3 attempts are made, sleeping 200 ms then 400 ms between them.
 * Retryable failures: network error, non-OK status, non-image content type,
 * conversion error. An oversized payload gives up immediately.
 *
 * @param {string} url - Image URL.
 * @param {number} [maxBytes=4000000] - Maximum accepted payload size in bytes.
 * @returns {Promise<Buffer|null>} WebP buffer, or null on failure.
 */
async function downloadAndConvertWebp(url, maxBytes = 4_000_000) {
  const retries = 2;
  const RETRY = Symbol("retry");

  // One download + conversion attempt: resolves to a Buffer, to null (give up
  // for good) or to RETRY (transient failure).
  const attemptOnce = async () => {
    try {
      const res = await fetchWithTimeout(
        url,
        {
          redirect: "follow",
          headers: {
            "User-Agent": USER_AGENT,
            Accept: "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
          },
        },
        FETCH_TIMEOUT_MS
      );
      if (!res.ok) return RETRY;

      // A missing content-type is tolerated; an explicit non-image one is not.
      const contentType = String(res.headers.get("content-type") || "").toLowerCase();
      if (contentType && !contentType.startsWith("image/")) return RETRY;

      // Size guard, first on the declared length, then on the actual payload.
      const declaredLength = Number(res.headers.get("content-length") || 0);
      if (declaredLength && declaredLength > maxBytes) return null;

      const payload = Buffer.from(await res.arrayBuffer());
      if (payload.length > maxBytes) return null;

      return await sharp(payload).webp({ quality: 78 }).toBuffer();
    } catch {
      return RETRY;
    }
  };

  for (let attempt = 0; attempt <= retries; attempt++) {
    const outcome = await attemptOnce();
    if (outcome !== RETRY) return outcome;
    if (attempt < retries) await sleep(200 * (attempt + 1));
  }
  return null;
}

/**
 * Selects a cover image and up to 10 gallery images among page candidates.
 *
 * Pipeline: filter out bad / logo-like / vision-rejected URLs, order by hint
 * scores, de-duplicate, probe dimensions of the first 40, then pick the cover
 * and fill the gallery from progressively weaker pools.
 *
 * @param {Array<{ url: string, bonus?: number, semantic?: number }|null>|null|undefined} imageCandidates
 *   Candidates; null/undefined entries and entries without `url` are ignored.
 * @param {string|null|undefined} logoUrl - Chosen logo, excluded from the result.
 * @param {{ visionScores?: Map<string, { score: number, label?: string }>|null }} [options]
 *   Optional per-URL vision results; a score of -9999 or lower hard-excludes the URL.
 * @returns {Promise<{ cover: string|null, gallery: string[] }>}
 */
async function pickTopImages(imageCandidates, logoUrl, { visionScores = null } = {}) {
  // Sum the hints of candidates that share the same URL.
  const bonusByUrl = new Map();
  const semanticByUrl = new Map();
  for (const x of imageCandidates || []) {
    if (!x?.url) continue;
    bonusByUrl.set(x.url, (bonusByUrl.get(x.url) || 0) + Number(x.bonus || 0));
    semanticByUrl.set(x.url, (semanticByUrl.get(x.url) || 0) + Number(x.semantic || 0));
  }

  const filtered = (imageCandidates || [])
    // Optional chaining: a null/undefined candidate is skipped, as in the loop
    // above, instead of throwing.
    .map((x) => x?.url)
    .filter((u) => u && !isBadImageUrl(u))
    .filter((u) => !isLikelyLogoUrl(u))
    .filter((u) => !logoUrl || u !== logoUrl)
    // Hard-exclude vision-rejected images immediately
    .filter((u) => !visionScores || (visionScores.get(u)?.score ?? 0) > -9999);

  // Sort by (semantic + bonus + vision) descending before dedup so the best candidates get probed first
  const sortedFiltered = [...new Set(filtered)].sort((a, b) => {
    const vA = visionScores?.get(a)?.score ?? 0;
    const vB = visionScores?.get(b)?.score ?? 0;
    const scoreA = (semanticByUrl.get(a) || 0) + (bonusByUrl.get(a) || 0) + Math.max(0, vA);
    const scoreB = (semanticByUrl.get(b) || 0) + (bonusByUrl.get(b) || 0) + Math.max(0, vB);
    return scoreB - scoreA;
  });

  const uniq = dedupeImageUrls(sortedFiltered).slice(0, 60);

  logDebug(`  [pickTopImages] filtered=${filtered.length} uniq=${uniq.length}`);

  const MAX_PROBE = 40; // probe more candidates (was 25)
  const limit = pLimit(6);
  const probed = await Promise.all(
    uniq.slice(0, MAX_PROBE).map((u) =>
      limit(async () => {
        const info = await probeRemoteImage(u);
        if (!info) return null;
        const area = (info.width || 0) * (info.height || 0);
        const ratio = info.height ? Number(info.width || 0) / Number(info.height || 1) : 0;
        const bonus = bonusByUrl.get(u) || 0;
        const semantic = semanticByUrl.get(u) || 0;
        // Vision score: 0 when no vision data, clamped to positive values for the multiplier
        const visionData = visionScores?.get(u);
        const visionScore = Math.max(0, visionData?.score ?? 0);
        const visionLabel = visionData?.label ?? null;

        // Pixel area is the base; hints are scaled so they can outweigh size differences.
        let score = area + Math.max(0, bonus) * 10_000 + semantic * 35_000 + visionScore * 45_000;
        // Favor landscape-ish photos, penalize very tall or very wide strips.
        if (ratio >= 1.05 && ratio <= 2.4) score += 120_000;
        if (ratio < 0.6 || ratio > 4.2) score -= 150_000;

        return { url: u, ...info, area, ratio, bonus, semantic, visionScore, visionLabel, score };
      })
    )
  );

  const probedOk = probed.filter(Boolean);
  const good = probedOk
    .filter((x) => x.area >= 300_000)
    .sort((a, b) => b.score - a.score);

  // Relaxed pool: images that were probed successfully but didn't meet the 300K area threshold
  // Still usable as fallback gallery images (minimum 40K area ≈ 200x200)
  const relaxed = probedOk
    .filter((x) => x.area >= 40_000 && x.area < 300_000)
    .sort((a, b) => b.score - a.score);

  logDebug(`  [pickTopImages] probed=${probedOk.length} good(≥300K)=${good.length} relaxed(≥40K)=${relaxed.length}`);

  // Cover selection: strongly prefer images with a visible human presence.
  // service_clean / neutral are only used as cover if no human-present image is available.
  // Images without any vision label count as "premium" too.
  const COVER_PREMIUM_LABELS = new Set(["human_team", "branding", "smile_client"]);
  const goodCoverPremium = good.filter((x) => !x.visionLabel || COVER_PREMIUM_LABELS.has(x.visionLabel));
  const goodCoverFallback = good.filter((x) => x.visionLabel && !COVER_PREMIUM_LABELS.has(x.visionLabel));
  const relaxedCoverPremium = relaxed.filter((x) => !x.visionLabel || COVER_PREMIUM_LABELS.has(x.visionLabel));
  // Pick best available cover in order: premium good → premium relaxed → fallback good → fallback relaxed → unprobed
  const cover =
    goodCoverPremium[0]?.url ||
    relaxedCoverPremium[0]?.url ||
    goodCoverFallback[0]?.url ||
    relaxed[0]?.url ||
    uniq[0] ||
    null;
  if (VISION_ENABLED && cover) {
    const coverLabel = [...good, ...relaxed].find((x) => x.url === cover)?.visionLabel ?? "no-vision";
    logDebug(`  [pickTopImages] cover label=${coverLabel}`);
  }
  const excludeCover = cover ? [cover] : [];

  // Build gallery: first from good (≥300K), then fill from relaxed, then from unprobed uniq
  const galleryFromGood = dedupeImageUrls(
    good.slice(cover === good[0]?.url ? 1 : 0, 20).map((x) => x.url).filter(Boolean),
    { exclude: excludeCover }
  );

  let gallery = galleryFromGood;

  // Fill from relaxed pool if not enough
  if (gallery.length < MIN_GALLERY_IMAGES) {
    const galleryFromRelaxed = dedupeImageUrls(
      relaxed.filter((x) => x.url !== cover).map((x) => x.url).filter(Boolean),
      { exclude: [...excludeCover, ...gallery] }
    );
    gallery = [...gallery, ...galleryFromRelaxed];
  }

  // Last resort: fill from unprobed unique URLs (images we couldn't probe but exist)
  if (gallery.length < MIN_GALLERY_IMAGES) {
    const probedUrls = new Set(probedOk.map((x) => x.url));
    const unprobedFallback = uniq.filter((u) => !probedUrls.has(u) && u !== cover && !gallery.includes(u));
    gallery = [...gallery, ...unprobedFallback];
  }

  gallery = dedupeImageUrls(gallery, { exclude: excludeCover }).slice(0, 10);

  logDebug(`  [pickTopImages] result: cover=${cover ? "yes" : "no"} gallery=${gallery.length}`);
  return { cover, gallery };
}

// Numeric weight of each vision label.
// Priority order: human/team > branding/smile > service photos > reject
// NOTE(review): presumably these are the `score` values stored in the
// `visionScores` map that pickTopImages consumes — confirm in scoreImagesWithVision.
// The `reject` sentinel (-9999) lines up with pickTopImages' hard-exclusion
// filter, which only keeps URLs whose score is strictly greater than -9999.
const VISION_LABEL_SCORE = {
  human_team:   100, // person(s) visibly working, team photo, artisan in action
  branding:      80, // branded vehicle, uniform with logo, storefront signage
  smile_client:  70, // smiling person, client interaction, handshake
  service_clean: 50, // clean result/before-after, equipment, finished work (no person)
  neutral:       20, // acceptable but no strong signal
  reject:      -9999, // screenshots, dirty/ugly, decorative stock, logos, text ads
};

/**
 * Classify candidate images with the vision model and map each label to a score.
 *
 * Only the first VISION_TOP_N URLs are evaluated, at most two at a time. Each
 * image is downloaded, base64-encoded and posted to `${OLLAMA_URL}/api/chat`
 * together with a JSON schema restricting the answer to the known labels.
 *
 * @param {string[]} urls - Candidate image URLs.
 * @param {{ industry?: string }} [options] - `industry` is interpolated into the prompt.
 * @returns {Promise<Map<string, { label: string, score: number, reason: string }>>}
 *   One entry per successfully scored URL. Any failure (download, model call,
 *   unparsable answer, unknown label) just leaves the URL out of the map.
 *   Returns an empty map when vision is disabled or `urls` is empty.
 */
async function scoreImagesWithVision(urls, { industry = "" } = {}) {
  if (!VISION_ENABLED || !urls?.length) return new Map();

  const schema = {
    type: "object",
    properties: {
      label: {
        type: "string",
        enum: ["human_team", "branding", "smile_client", "service_clean", "neutral", "reject"],
      },
      reason: { type: "string" },
    },
    required: ["label", "reason"],
  };

  const prompt = `Tu évalues une photo pour une fiche d'entreprise de "${industry}" en Belgique.
Classe-la avec le label le plus précis parmi :
- human_team   : personne(s) clairement visible(s) au travail, équipe, artisan en action, portrait pro
- branding     : véhicule avec logo/marque, uniforme brandé, devanture avec enseigne (UNIQUEMENT si une vraie scène réelle accompagne la marque)
- smile_client : personne qui sourit, interaction client, satisfaction visible
- service_clean: résultat propre / avant-après soigné / équipement métier (sans humain)
- neutral      : photo acceptable mais signal faible
- reject       : capture d'écran (Google, avis, étoiles), intervention sale ou inutilisable, photo décorative générique, banque d'image, photo floue — ET OBLIGATOIREMENT tout logo typographique (nom d'entreprise, symbole, slogan, texte de marque seul, fond coloré ou sombre avec texte), même s'il semble professionnel
RÈGLE ABSOLUE : si l'image est un logo (texte + marque visuels uniquement, sans vraie photo de personne, véhicule ou résultat de travail) → reject, sans exception.
Réponds uniquement en JSON strict.`;

  const scores = new Map();
  const limit = pLimit(2);

  await Promise.all(
    urls.slice(0, VISION_TOP_N).map((url) =>
      limit(async () => {
        try {
          const res = await fetchWithTimeout(
            url,
            { headers: { "User-Agent": USER_AGENT, Accept: "image/*" } },
            10000
          );
          if (!res || !res.ok) return;
          const buf = Buffer.from(await res.arrayBuffer());
          if (buf.length < 4096) return; // too small to be meaningful
          const b64 = buf.toString("base64");

          const body = {
            model: VISION_MODEL,
            messages: [{ role: "user", content: prompt, images: [b64] }],
            format: schema,
            options: { temperature: 0.05, num_ctx: 2048 },
            stream: false,
          };

          const vRes = await fetchWithTimeout(
            `${OLLAMA_URL}/api/chat`,
            { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify(body) },
            45000
          );
          if (!vRes || !vRes.ok) return;
          const json = await vRes.json();
          const raw = json?.message?.content;
          if (!raw) return;
          // The content may arrive already parsed or as a JSON string.
          const parsed = typeof raw === "object" ? raw : JSON.parse(raw);
          const label = parsed?.label;
          // Own-property check on purpose: `in` also matches inherited keys such as
          // "constructor" or "toString", which would store a non-numeric score.
          if (typeof label !== "string" || !Object.prototype.hasOwnProperty.call(VISION_LABEL_SCORE, label)) return;

          scores.set(url, { label, score: VISION_LABEL_SCORE[label], reason: parsed.reason || "" });
          logDebug(`  [Vision] ${url.split("/").slice(-1)[0].slice(0, 40)} → ${label} (${VISION_LABEL_SCORE[label]})`);
        } catch {
          // Vision failure is non-fatal — image keeps heuristic score
        }
      })
    )
  );

  return scores;
}

/**
 * Heuristic: does the candidate pool look like generic/decorative imagery?
 *
 * Looks at the 12 best candidates (ranked by `semantic + 0.2 * bonus`) and
 * returns true when most of them carry a weak semantic score, or when several
 * URLs contain decorative keywords and none has a strong semantic score.
 *
 * @param {Array<{ url: string, bonus?: number, semantic?: number }>} imageCandidates
 * @returns {boolean} false for an empty pool.
 */
function looksCorporateImagePool(imageCandidates) {
  const rankOf = (row) => row.semantic + row.bonus * 0.2;

  // Keep only entries with a URL, coercing the numeric fields.
  const normalized = (imageCandidates || []).flatMap((cand) =>
    cand && cand.url
      ? [{ url: String(cand.url || ""), bonus: Number(cand.bonus || 0), semantic: Number(cand.semantic || 0) }]
      : []
  );
  if (normalized.length === 0) return false;

  const best = [...normalized].sort((a, b) => rankOf(b) - rankOf(a)).slice(0, 12);

  let strongHuman = 0;
  let weak = 0;
  for (const row of best) {
    if (row.semantic >= 8) strongHuman += 1;
    if (row.semantic <= 1) weak += 1;
  }

  const urlBlob = normalizeBceForTokenMatch(best.map((row) => row.url).join(" "));
  const decorativeMatches = urlBlob.match(/\b(hero|banner|header|stock|placeholder|background|slider)\b/g) || [];
  const decorativeHits = decorativeMatches.length;

  // Two or more strong images outweigh every negative signal.
  if (strongHuman >= 2) return false;

  const weakThreshold = Math.max(4, Math.floor(best.length * 0.6));
  if (weak >= weakThreshold) return true;

  return decorativeHits >= 3 && strongHuman === 0;
}

// -------------------- SUPABASE --------------------
// Shared Supabase client, created only when both the URL and the service-role key
// are set; otherwise null. Functions in this section check for null before use.
// Session persistence is turned off.
const supabase =
  SUPABASE_URL && SUPABASE_SERVICE_ROLE_KEY
    ? createClient(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, { auth: { persistSession: false } })
    : null;

/**
 * Check whether a BCE number exists in the `kbo_entities` table.
 *
 * The lookup is tried on `bce_number` first, then on the older
 * `enterprise_number` column.
 *
 * @param {string} bce10 - BCE number to look up.
 * @returns {Promise<{ ok: boolean|null, status: string|null }>}
 *   - `ok: true`  — a row matched; `status` is the row's status or "active".
 *   - `ok: false` — at least one query ran without error but found no row.
 *   - `ok: null`  — no Supabase client, or both queries errored.
 */
async function verifyBceInKbo(bce10) {
  const undetermined = () => ({ ok: null, status: null });
  if (!supabase) return undetermined();

  const lookup = async (column) => {
    const response = await supabase.from("kbo_entities").select("status").eq(column, bce10).maybeSingle();
    return { data: response.data, error: response.error };
  };
  const confirmed = (row) => ({ ok: true, status: row.status || "active" });

  // Current schema column first.
  const primary = await lookup("bce_number");
  if (!primary.error && primary.data) return confirmed(primary.data);

  // Backward-compatible column name.
  const legacy = await lookup("enterprise_number");
  if (!legacy.error && legacy.data) return confirmed(legacy.data);

  // Both lookups failed (schema/access issue): we cannot tell either way.
  if (primary.error && legacy.error) return undetermined();

  // At least one lookup was valid and simply matched nothing.
  return { ok: false, status: null };
}

/**
 * Upload a WebP buffer to the configured storage bucket (overwriting any
 * existing object) and return its public URL.
 *
 * @param {string} publicPath - Object path inside the bucket.
 * @param {Buffer} bufferWebp - Encoded WebP image.
 * @returns {Promise<string|null>} Public URL, or null when there is no Supabase
 *   client, the upload reports an error, or no public URL is returned.
 */
async function uploadImageWebp(publicPath, bufferWebp) {
  if (!supabase) return null;

  const uploadOptions = {
    contentType: "image/webp",
    upsert: true,
    cacheControl: "31536000",
  };
  const uploaded = await supabase.storage.from(SUPABASE_BUCKET).upload(publicPath, bufferWebp, uploadOptions);
  if (uploaded.error) return null;

  const resolved = supabase.storage.from(SUPABASE_BUCKET).getPublicUrl(publicPath);
  return resolved.data?.publicUrl || null;
}

// -------------------- SEARCH --------------------
/**
 * Replace every `<...>` tag in the input with a single space.
 *
 * @param {*} s - Value to clean; falsy values are treated as an empty string.
 * @returns {string} Text with tags replaced (whitespace is not collapsed).
 */
function stripHtmlTags(s) {
  const text = String(s || "");
  // Splitting on the tag pattern and re-joining with a space swaps each tag for " ".
  return text.split(/<[^>]+>/).join(" ");
}

// ---- Serper (Google) API ----
// Concurrency gate for Serper: serperWebSearchRaw() runs its request callback
// through this limiter, created with a concurrency of 1.
const serperLimit = pLimit(1);
// Date.now() timestamp written by serperThrottle(); 0 means it has not recorded a request yet.
let serperLastAt = 0;

/**
 * Tell whether an HTTP status from Serper is worth retrying.
 *
 * @param {number} status - HTTP status code.
 * @returns {boolean} true for 408, 429 and any status of 500 or above.
 */
function isSerperRetryableStatus(status) {
  const retryableClientStatuses = [408, 429];
  return retryableClientStatuses.includes(status) || status >= 500;
}

/**
 * Enforce a minimum gap of SERPER_MIN_DELAY_MS between Serper requests.
 *
 * Does nothing when the delay is falsy. The first call only records the
 * current time; later calls sleep for whatever remains of the gap and then
 * record the time again.
 *
 * @returns {Promise<void>}
 */
async function serperThrottle() {
  const minGapMs = SERPER_MIN_DELAY_MS;
  if (!minGapMs) return;

  const calledAt = Date.now();
  if (serperLastAt) {
    const remaining = serperLastAt + minGapMs - calledAt;
    if (remaining > 0) await sleep(remaining);
    serperLastAt = Date.now();
  } else {
    // Nothing recorded yet: no wait, just start the clock.
    serperLastAt = calledAt;
  }
}

/**
 * Run one Serper web search and return the parsed JSON response.
 *
 * Calls are serialized through `serperLimit` and spaced by `serperThrottle()`.
 * Retryable HTTP statuses (see `isSerperRetryableStatus`) and thrown
 * network/parse errors are retried up to SERPER_MAX_RETRIES times with
 * exponential backoff plus up to 400 ms of random jitter. Any other non-OK
 * status (e.g. 400/401/403) fails immediately instead of being retried.
 *
 * @param {string} query - Search query.
 * @param {{ count?: number, page?: number }} [options] - `count` is clamped to 1..20, `page` to >= 1.
 * @returns {Promise<object|null>} Parsed response body.
 * @throws {Error} When SERPER_API_KEY is missing, on a non-retryable HTTP
 *   status, or once retries are exhausted.
 */
async function serperWebSearchRaw(query, { count = SERPER_COUNT, page = 1 } = {}) {
  if (!SERPER_API_KEY) throw new Error("SERPER_API_KEY missing (required when SEARCH_PROVIDER=serper).");

  const n = Math.max(1, Math.min(20, Number(count) || SERPER_COUNT));
  const p = Math.max(1, Number(page) || 1);
  const body = { q: query, gl: SERPER_GL, hl: SERPER_HL, num: n, page: p };

  // Exponential backoff capped at SERPER_BACKOFF_MAX_MS, plus jitter.
  const backoffDelay = (attempt) => {
    const exp = Math.min(SERPER_BACKOFF_MAX_MS, SERPER_BACKOFF_BASE_MS * 2 ** attempt);
    const jitter = Math.floor(Math.random() * 400);
    return exp + jitter;
  };

  return await serperLimit(async () => {
    await serperThrottle();

    for (let attempt = 0; attempt <= SERPER_MAX_RETRIES; attempt++) {
      const canRetry = attempt < SERPER_MAX_RETRIES;
      try {
        const res = await fetchWithTimeout(
          SERPER_ENDPOINT,
          {
            method: "POST",
            headers: {
              Accept: "application/json",
              "Content-Type": "application/json",
              "User-Agent": USER_AGENT,
              "X-API-KEY": SERPER_API_KEY,
            },
            body: JSON.stringify(body),
          },
          SERPER_TIMEOUT_MS
        );

        if (isSerperRetryableStatus(res.status) && canRetry) {
          const delay = backoffDelay(attempt);
          console.warn(`   Serper rate-limited/unstable (HTTP ${res.status}); retry in ${delay}ms...`);
          await sleep(delay);
          continue;
        }

        if (!res.ok) {
          const txt = await res.text().catch(() => "");
          const httpErr = new Error(`Serper search HTTP ${res.status}: ${txt.slice(0, 240)}`);
          // Tag it so the catch below does not retry a status that was already
          // judged non-retryable (or that has run out of retries).
          httpErr.retryable = false;
          throw httpErr;
        }

        return await res.json();
      } catch (err) {
        if (canRetry && err?.retryable !== false) {
          const delay = backoffDelay(attempt);
          console.warn(`   Serper search error; retry in ${delay}ms... (${String(err?.message || err)})`);
          await sleep(delay);
          continue;
        }
        throw err;
      }
    }

    return null;
  });
}

/**
 * Convert a Serper response into `{ title, url, snippet }` rows.
 *
 * Reads `json.organic`; the URL comes from `link` (or `url`), the snippet from
 * `snippet` (or `description`) with HTML tags stripped. Rows for which
 * `safeUrl` yields a falsy URL are dropped.
 *
 * @param {object|null|undefined} json - Parsed Serper response.
 * @returns {Array<{ title: string, url: string, snippet: string }>}
 */
function serperResultsFromJson(json) {
  const organic = json?.organic || [];
  return organic.flatMap((entry) => {
    const title = normSpaces(entry.title || "");
    const url = safeUrl(entry.link || entry.url);
    const snippet = normSpaces(stripHtmlTags(entry.snippet || entry.description || ""));
    return url ? [{ title, url, snippet }] : [];
  });
}

/**
 * Search through Serper and return at most one result per domain.
 *
 * Blacklisted URLs are dropped; for each domain the first result in response
 * order wins.
 *
 * @param {string} query - Search query.
 * @param {{ count?: number, page?: number }} [options] - Forwarded to serperWebSearchRaw.
 * @returns {Promise<Array<{ title: string, url: string, snippet: string }>>}
 */
async function serperSearch(query, { count = SERPER_COUNT, page = 1 } = {}) {
  const payload = await serperWebSearchRaw(query, { count, page });

  // A Map keeps insertion order, so values() yields first-seen results in order.
  const firstHitByDomain = new Map();
  for (const hit of serperResultsFromJson(payload)) {
    if (!hit.url || isBlacklisted(hit.url)) continue;
    const domain = domainOf(hit.url);
    if (!firstHitByDomain.has(domain)) firstHitByDomain.set(domain, hit);
  }
  return [...firstHitByDomain.values()];
}

// ---- Brave Search API ----
// Concurrency gate for Brave: braveWebSearchRaw() runs its request callback
// through this limiter, created with a concurrency of 1.
const braveLimit = pLimit(1);
// Date.now() timestamp written by braveThrottle(); 0 means it has not recorded a request yet.
let braveLastAt = 0;

/**
 * Tell whether an HTTP status from Brave is worth retrying.
 *
 * @param {number} status - HTTP status code.
 * @returns {boolean} true for 429 and any status of 500 or above.
 */
function isBraveRetryableStatus(status) {
  if (status === 429) return true;
  return status >= 500;
}

/**
 * Enforce a minimum gap of BRAVE_MIN_DELAY_MS between Brave requests.
 *
 * Does nothing when the delay is falsy. The first call only records the
 * current time; later calls sleep for whatever remains of the gap and then
 * record the time again.
 *
 * @returns {Promise<void>}
 */
async function braveThrottle() {
  const minGapMs = BRAVE_MIN_DELAY_MS;
  if (!minGapMs) return;

  const calledAt = Date.now();
  if (braveLastAt) {
    const remaining = braveLastAt + minGapMs - calledAt;
    if (remaining > 0) await sleep(remaining);
    braveLastAt = Date.now();
  } else {
    // Nothing recorded yet: no wait, just start the clock.
    braveLastAt = calledAt;
  }
}

/**
 * Run one Brave web search and return the parsed JSON response.
 *
 * Calls are serialized through `braveLimit` and spaced by `braveThrottle()`.
 * Retryable HTTP statuses (see `isBraveRetryableStatus`) and thrown
 * network/parse errors are retried up to BRAVE_MAX_RETRIES times with
 * exponential backoff plus up to 400 ms of random jitter. Any other non-OK
 * status (e.g. 400/401/403) fails immediately instead of being retried.
 *
 * @param {string} query - Search query.
 * @param {{ count?: number, offset?: number }} [options] - `count` is clamped
 *   to 1..20; `offset` is only sent when truthy.
 * @returns {Promise<object|null>} Parsed response body.
 * @throws {Error} When BRAVE_API_KEY is missing, on a non-retryable HTTP
 *   status, or once retries are exhausted.
 */
async function braveWebSearchRaw(query, { count = BRAVE_COUNT, offset = 0 } = {}) {
  if (!BRAVE_API_KEY) throw new Error("BRAVE_API_KEY missing (required when SEARCH_PROVIDER=brave).");

  const params = new URLSearchParams();
  params.set("q", query);
  params.set("count", String(Math.max(1, Math.min(20, Number(count) || BRAVE_COUNT))));
  params.set("country", BRAVE_COUNTRY);
  params.set("search_lang", BRAVE_SEARCH_LANG);
  params.set("safesearch", BRAVE_SAFESEARCH);
  if (offset) params.set("offset", String(Math.max(0, Number(offset) || 0)));

  const url = `https://api.search.brave.com/res/v1/web/search?${params.toString()}`;

  // Exponential backoff capped at BRAVE_BACKOFF_MAX_MS, plus jitter.
  const backoffDelay = (attempt) => {
    const exp = Math.min(BRAVE_BACKOFF_MAX_MS, BRAVE_BACKOFF_BASE_MS * 2 ** attempt);
    const jitter = Math.floor(Math.random() * 400);
    return exp + jitter;
  };

  return await braveLimit(async () => {
    await braveThrottle();

    for (let attempt = 0; attempt <= BRAVE_MAX_RETRIES; attempt++) {
      const canRetry = attempt < BRAVE_MAX_RETRIES;
      try {
        const res = await fetchWithTimeout(
          url,
          {
            method: "GET",
            headers: {
              Accept: "application/json",
              "User-Agent": USER_AGENT,
              "X-Subscription-Token": BRAVE_API_KEY,
            },
          },
          BRAVE_TIMEOUT_MS
        );

        if (isBraveRetryableStatus(res.status) && canRetry) {
          const delay = backoffDelay(attempt);
          console.warn(`   Brave rate-limited/unstable (HTTP ${res.status}); retry in ${delay}ms...`);
          await sleep(delay);
          continue;
        }

        if (!res.ok) {
          const body = await res.text().catch(() => "");
          const httpErr = new Error(`Brave search HTTP ${res.status}: ${body.slice(0, 240)}`);
          // Tag it so the catch below does not retry a status that was already
          // judged non-retryable (or that has run out of retries).
          httpErr.retryable = false;
          throw httpErr;
        }

        return await res.json();
      } catch (err) {
        if (canRetry && err?.retryable !== false) {
          const delay = backoffDelay(attempt);
          console.warn(`   Brave search error; retry in ${delay}ms... (${String(err?.message || err)})`);
          await sleep(delay);
          continue;
        }
        throw err;
      }
    }

    return null;
  });
}

/**
 * Convert a Brave response into `{ title, url, snippet }` rows.
 *
 * Reads `json.web.results`; the snippet comes from `description` (or
 * `snippet`) with HTML tags stripped. Rows for which `safeUrl` yields a falsy
 * URL are dropped.
 *
 * @param {object|null|undefined} json - Parsed Brave response.
 * @returns {Array<{ title: string, url: string, snippet: string }>}
 */
function braveResultsFromJson(json) {
  const webResults = json?.web?.results || [];
  return webResults.flatMap((entry) => {
    const title = normSpaces(entry.title || "");
    const url = safeUrl(entry.url);
    const snippet = normSpaces(stripHtmlTags(entry.description || entry.snippet || ""));
    return url ? [{ title, url, snippet }] : [];
  });
}

/**
 * Search through Brave and return at most one result per domain.
 *
 * Blacklisted URLs are dropped; for each domain the first result in response
 * order wins.
 *
 * @param {string} query - Search query.
 * @param {{ count?: number, offset?: number }} [options] - Forwarded to braveWebSearchRaw.
 * @returns {Promise<Array<{ title: string, url: string, snippet: string }>>}
 */
async function braveSearch(query, { count = BRAVE_COUNT, offset = 0 } = {}) {
  const payload = await braveWebSearchRaw(query, { count, offset });

  // A Map keeps insertion order, so values() yields first-seen results in order.
  const firstHitByDomain = new Map();
  for (const hit of braveResultsFromJson(payload)) {
    if (!hit.url || isBlacklisted(hit.url)) continue;
    const domain = domainOf(hit.url);
    if (!firstHitByDomain.has(domain)) firstHitByDomain.set(domain, hit);
  }
  return [...firstHitByDomain.values()];
}

// Concurrency gate shared by the DuckDuckGo helpers in this section (web, image
// and lite search all run their callbacks through it); created with a concurrency of 1.
const ddgLimit = pLimit(1);
// Date.now() timestamp written by ddgThrottle(); 0 means it has not recorded a request yet.
let ddgLastAt = 0;

/**
 * Tell whether an error message mentions DuckDuckGo's anomaly detection.
 *
 * @param {*} err - Error object or message.
 * @returns {boolean} true when the message matches "anomaly" (case-insensitive).
 */
function isDdgAnomalyError(err) {
  const text = String(err?.message || err || "");
  const markers = [/anomaly/i, /anomalydetectionblock/i];
  return markers.some((marker) => marker.test(text));
}

/**
 * Decide whether a DuckDuckGo search failure is worth retrying.
 *
 * @param {*} err - Error object or message.
 * @returns {boolean} true for anomaly blocks and for messages matching one of
 *   the known transient patterns below; false for an empty message.
 */
function isDdgRetryableError(err) {
  const msg = String(err?.message || err || "");
  if (!msg) return false;
  if (isDdgAnomalyError(err)) return true;

  const transientPatterns = [
    // duck-duck-scrape can throw TypeError when DDG returns an unexpected body (regex mismatch)
    /Cannot read properties of null \(reading ['"]1['"]\)/i,
    // VQD fetch can fail (blocked / throttled)
    /Failed to get the VQD/i,
    // network-ish
    /(ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN)/i,
    // generic server error from the library
    /server error occurred/i,
  ];
  return transientPatterns.some((pattern) => pattern.test(msg));
}

/**
 * Enforce a minimum gap of DDG_MIN_DELAY_MS between DuckDuckGo requests.
 *
 * Does nothing when the delay is falsy. The first call only records the
 * current time; later calls sleep for whatever remains of the gap and then
 * record the time again.
 *
 * @returns {Promise<void>}
 */
async function ddgThrottle() {
  const minGapMs = DDG_MIN_DELAY_MS;
  if (!minGapMs) return;

  const calledAt = Date.now();
  if (ddgLastAt) {
    const remaining = ddgLastAt + minGapMs - calledAt;
    if (remaining > 0) await sleep(remaining);
    ddgLastAt = Date.now();
  } else {
    // Nothing recorded yet: no wait, just start the clock.
    ddgLastAt = calledAt;
  }
}

/**
 * Run a duck-duck-scrape web search with throttling and retries.
 *
 * The call goes through `ddgLimit`, waits on `ddgThrottle()` once, then
 * retries retryable failures (see `isDdgRetryableError`) up to DDG_MAX_RETRIES
 * times with exponential backoff plus up to 400 ms of jitter.
 *
 * @param {string} query - Search query.
 * @returns {Promise<object>} The library's response, or `{ results: [] }` if
 *   the retry loop ends without returning.
 * @throws Rethrows non-retryable errors and the last error once retries run out.
 */
async function ddgWebSearchRaw(query) {
  const runWithRetries = async () => {
    await ddgThrottle();

    let attempt = 0;
    while (attempt <= DDG_MAX_RETRIES) {
      try {
        return await ddgSearchLib(query, { safeSearch: DDG_SAFESEARCH, locale: DDG_LOCALE }, DDG_NEEDLE_OPTIONS);
      } catch (err) {
        const retryable = isDdgRetryableError(err);
        if (!retryable || attempt >= DDG_MAX_RETRIES) throw err;

        const capped = Math.min(DDG_BACKOFF_MAX_MS, DDG_BACKOFF_BASE_MS * 2 ** attempt);
        const delay = capped + Math.floor(Math.random() * 400);
        console.warn(`   DDG search blocked/unstable; retry in ${delay}ms...`);
        await sleep(delay);
        attempt += 1;
      }
    }

    return { results: [] };
  };

  return await ddgLimit(runWithRetries);
}

/**
 * Best-effort DuckDuckGo image search.
 *
 * Looks up `searchImages` on the duck-duck-scrape module at call time and runs
 * it through the DDG limiter and throttle.
 *
 * @param {string} query - Search query.
 * @returns {Promise<object|null>} The library's response, or null when
 *   `searchImages` is not available or the search throws.
 */
async function ddgImageSearchRaw(query) {
  const imageSearch = interopNamed(ddgMod, "searchImages");
  if (typeof imageSearch !== "function") return null;

  const runOnce = async () => {
    await ddgThrottle();
    try {
      const searchOptions = { safeSearch: DDG_SAFESEARCH, locale: DDG_LOCALE };
      return await imageSearch(query, searchOptions, DDG_NEEDLE_OPTIONS);
    } catch {
      // No retries here: a failed image search just yields nothing.
      return null;
    }
  };

  return await ddgLimit(runOnce);
}

/**
 * Resolve a link taken from a DuckDuckGo result page to its real target.
 *
 * DDG wraps outbound links as `/l/?uddg=<encoded target>`. When the `uddg`
 * parameter is present its value is returned; otherwise the href itself
 * (made absolute) is returned.
 *
 * @param {string} href - Raw `href`; may be absolute, root-relative ("/l/?...")
 *   or protocol-relative ("//duckduckgo.com/l/?...").
 * @returns {string|null} Result of `safeUrl` on the resolved URL, or null for
 *   empty or unparsable input.
 */
function resolveDuckDuckGoRedirect(href) {
  if (!href) return null;
  const raw = String(href || "").trim();
  if (!raw) return null;

  // Resolving against the DDG origin handles "/path" and "//host/path" alike;
  // plain string concatenation would glue "//host/path" onto the DDG origin.
  const isRelative = raw.startsWith("/");

  try {
    const u = isRelative ? new URL(raw, "https://duckduckgo.com") : new URL(raw);

    // searchParams.get() already percent-decodes the value. Decoding it a second
    // time would alter escapes that belong to the target (e.g. "%2F" -> "/") and
    // throw on a bare "%", so the value is used as-is.
    const uddg = u.searchParams.get("uddg");
    if (uddg) return safeUrl(uddg);

    // Sometimes DDG uses other params; keep the absolute URL as a fallback.
    return safeUrl(isRelative ? u.href : raw);
  } catch {
    return null;
  }
}

/**
 * Fallback search against the DuckDuckGo "lite" HTML endpoint.
 *
 * Fetches the result page (through the DDG limiter and throttle), then collects
 * every anchor whose resolved target is not on duckduckgo.com.
 *
 * @param {string} query - Search query.
 * @returns {Promise<Array<{ title: string, url: string, snippet: string }>>}
 *   Rows in document order; `snippet` is always "" and `title` falls back to
 *   the target domain when the anchor has no text.
 * @throws {Error} On a non-OK HTTP status, or when the page text matches the
 *   anomaly pattern.
 */
async function ddgLiteSearch(query) {
  const fetchAndParse = async () => {
    await ddgThrottle();

    const endpoint = `https://lite.duckduckgo.com/lite/?q=${encodeURIComponent(query)}`;
    const requestHeaders = {
      "User-Agent": DDG_NEEDLE_OPTIONS.headers?.["user-agent"] || USER_AGENT,
      "Accept-Language": DDG_NEEDLE_OPTIONS.headers?.["accept-language"] || "fr-BE,fr;q=0.9,en;q=0.8",
      Accept: "text/html,application/xhtml+xml",
    };
    const res = await fetchWithTimeout(
      endpoint,
      { redirect: "follow", headers: requestHeaders },
      DDG_REQUEST_TIMEOUT_MS
    );
    if (!res.ok) throw new Error(`DDG lite HTTP ${res.status}`);

    const html = await res.text();
    if (/anomalydetectionblock|anomaly/i.test(html)) {
      throw new Error("DDG detected an anomaly in the request, you are likely making requests too quickly.");
    }

    const $ = cheerio.load(html);
    const hits = [];
    $("a[href]").each((_, anchor) => {
      const target = resolveDuckDuckGoRedirect($(anchor).attr("href"));
      if (!target) return;

      const host = domainOf(target);
      // Skip unparsable targets and DDG's own navigation links.
      if (!host || host.includes("duckduckgo.com")) return;

      hits.push({ title: normSpaces($(anchor).text()) || host, url: target, snippet: "" });
    });
    return hits;
  };

  return await ddgLimit(fetchAndParse);
}

/**
 * Search DuckDuckGo, falling back to the lite endpoint when the library fails.
 *
 * Results are filtered (URL present, not blacklisted), reduced to the first
 * hit per domain, and truncated to `count`.
 *
 * @param {string} query - Search query.
 * @param {{ count?: number }} [options] - Max results, clamped to 1..20 (default 10).
 * @returns {Promise<Array<{ title: string, url: string, snippet: string }>>}
 *   Empty array when both the library and the lite endpoint fail.
 */
async function ddgSearch(query, { count = 10 } = {}) {
  const maxResults = Math.max(1, Math.min(20, Number(count) || 10));

  let hits = [];
  try {
    const res = await ddgWebSearchRaw(query);
    hits = (res?.results || []).map((r) => ({ title: r.title, url: safeUrl(r.url), snippet: r.description || "" }));
  } catch (err) {
    console.warn(`   DDG search() failed; trying lite endpoint. (${String(err?.message || err)})`);
    try {
      hits = await ddgLiteSearch(query);
    } catch (err2) {
      console.warn(`   DDG lite failed too. (${String(err2?.message || err2)})`);
      return [];
    }
  }

  // A Map keeps insertion order, so values() yields first-seen results in order.
  const firstHitByDomain = new Map();
  for (const hit of hits) {
    if (!hit.url || isBlacklisted(hit.url)) continue;
    const domain = domainOf(hit.url);
    if (!firstHitByDomain.has(domain)) firstHitByDomain.set(domain, hit);
  }
  return [...firstHitByDomain.values()].slice(0, maxResults);
}

/**
 * Provider-agnostic web search (deduplicated per domain by the provider helper).
 *
 * Dispatches on SEARCH_PROVIDER: "serper" (falling back to Brave when a Brave
 * key is configured), "brave", or DuckDuckGo for anything else.
 *
 * @param {string} query - Search query.
 * @param {{ count?: number, page?: number, offset?: number }} [options]
 *   `count` is clamped to 1..20, `page` to >= 1, `offset` to >= 0. Serper uses
 *   `page`; Brave uses `offset`; DuckDuckGo uses neither.
 * @returns {Promise<Array<{ title: string, url: string, snippet: string }>>}
 * @throws Rethrows the Serper error when no Brave key is available.
 */
async function searchWeb(query, { count = 10, page = 1, offset = 0 } = {}) {
  const n = Math.max(1, Math.min(20, Number(count) || 10));
  const p = Math.max(1, Number(page) || 1);
  const o = Math.max(0, Number(offset) || 0);

  switch (SEARCH_PROVIDER) {
    case "serper": {
      try {
        return await serperSearch(query, { count: n, page: p });
      } catch (err) {
        if (!BRAVE_API_KEY) throw err;
        console.warn(`   Serper failed; falling back to Brave. (${String(err?.message || err)})`);
        // Brave paginates by offset: use the explicit one if given, else derive it from the page.
        const hasExplicitOffset = Number.isFinite(o) && o > 0;
        const fallbackOffset = hasExplicitOffset ? o : (p - 1) * n;
        return await braveSearch(query, { count: n, offset: fallbackOffset });
      }
    }
    case "brave":
      return await braveSearch(query, { count: n, offset: o });
    default:
      return await ddgSearch(query, { count: n });
  }
}

/**
 * Provider-agnostic search returning raw rows: no blacklist filtering and no
 * per-domain deduplication (contrast with `searchWeb`).
 *
 * Dispatches on SEARCH_PROVIDER: "serper" (falling back to Brave when a Brave
 * key is configured), "brave", or DuckDuckGo (library first, then the lite
 * endpoint) for anything else.
 *
 * @param {string} query - Search query.
 * @param {{ count?: number }} [options] - Max rows, clamped to 1..20 (default 10).
 * @returns {Promise<Array<{ title: string, url: string, snippet: string }>>}
 *   For DuckDuckGo, an empty array when both attempts fail.
 * @throws Rethrows Serper/Brave errors (Serper only when no Brave key is set).
 */
async function searchRawResults(query, { count = 10 } = {}) {
  const n = Math.max(1, Math.min(20, Number(count) || 10));

  const viaBrave = async () => {
    const json = await braveWebSearchRaw(query, { count: n });
    return braveResultsFromJson(json).slice(0, n);
  };

  switch (SEARCH_PROVIDER) {
    case "serper": {
      try {
        const json = await serperWebSearchRaw(query, { count: n });
        return serperResultsFromJson(json).slice(0, n);
      } catch (err) {
        if (!BRAVE_API_KEY) throw err;
        console.warn(`   Serper failed (raw); falling back to Brave. (${String(err?.message || err)})`);
        return await viaBrave();
      }
    }
    case "brave":
      return await viaBrave();
    default:
      break;
  }

  // DuckDuckGo: library first, lite endpoint second, empty list last.
  try {
    const res = await ddgWebSearchRaw(query);
    const rows = (res?.results || []).map((r) => ({ title: r.title, url: safeUrl(r.url), snippet: r.description || "" }));
    return rows.filter((r) => r.url).slice(0, n);
  } catch {
    try {
      const liteRows = await ddgLiteSearch(query);
      return liteRows.slice(0, n);
    } catch {
      return [];
    }
  }
}

// -------------------- FALLBACK (RECOVERY) --------------------
/**
 * Fallback: look for a contact email in Facebook/Instagram search snippets.
 *
 * @param {string} name - Business name (quoted in the query).
 * @param {string} city - City (quoted in the query).
 * @param {string} websiteUrl - Passed to `pickBestEmail` when choosing among candidates.
 * @returns {Promise<string|null>} The chosen email, or null when recovery is
 *   disabled, nothing was found, or the search failed.
 */
async function tryRecoverEmailFromSnippets(name, city, websiteUrl) {
  if (!RECOVER_EMAIL) return null;

  console.log("   Fallback email via social snippets...");
  const q = `"${name}" "${city}" email site:facebook.com OR site:instagram.com`;

  try {
    const hits = await searchRawResults(q, { count: 10 });
    const corpus = hits.map(({ snippet }) => snippet || "").join(" ");
    const found = extractEmailsFromText(corpus);
    return found.length ? pickBestEmail(found, websiteUrl) : null;
  } catch {
    // Best-effort: any search/extraction failure means "no email".
    return null;
  }
}

/**
 * Pull the page slug (first path segment) out of a Facebook URL.
 * Example: "https://www.facebook.com/dmsynergie/" gives "dmsynergie".
 *
 * @param {string} rawUrl - Absolute URL.
 * @returns {string|null} The first path segment in its original casing, or
 *   null when the URL cannot be parsed, has no path segment, or starts with a
 *   non-page path (pages, groups, events, watch, profile.php).
 */
function fbPageSlugFromUrl(rawUrl) {
  let pathname;
  try {
    pathname = new URL(rawUrl).pathname;
  } catch {
    return null;
  }

  const [head] = pathname.split("/").filter(Boolean);
  if (!head) return null;

  // These first segments are not page slugs (compared case-insensitively).
  const nonPageHeads = ["pages", "groups", "events", "watch", "profile.php"];
  return nonPageHeads.includes(head.toLowerCase()) ? null : head;
}

/**
 * Fetch a Facebook page's profile picture and cover photo via the Graph API,
 * authenticating with the app token `FB_APP_ID|FB_APP_SECRET`.
 *
 * - profilePic: works with the app token alone (no review needed).
 * - cover: requires Page Public Content Access (Meta review); a denial is only
 *   logged at debug level and yields null.
 *
 * @param {string} facebookPageUrl - URL of the Facebook page.
 * @returns {Promise<{ profilePic: string|null, cover: string|null }>}
 *   Both null when the app credentials are missing or no slug can be derived.
 *   Request failures never throw; the affected field stays null.
 */
async function fetchFacebookImages(facebookPageUrl) {
  const images = { profilePic: null, cover: null };

  if (!FB_APP_ID || !FB_APP_SECRET) return images;
  const slug = fbPageSlugFromUrl(facebookPageUrl);
  if (!slug) return images;

  const accessToken = `${FB_APP_ID}|${FB_APP_SECRET}`;
  const graphNode = `https://graph.facebook.com/v22.0/${slug}`;

  // 1) Profile picture: `redirect=false` makes the endpoint answer with JSON metadata.
  try {
    const res = await fetchWithTimeout(
      `${graphNode}/picture?width=960&redirect=false&access_token=${accessToken}`,
      { redirect: "follow" },
      FETCH_TIMEOUT_MS
    );
    if (res.ok) {
      const payload = await res.json();
      images.profilePic = payload.data?.url || null;
      if (images.profilePic) {
        const w = payload.data?.width || "?";
        const h = payload.data?.height || "?";
        logInfo(`   [FB] profile pic ${w}x${h}: ${images.profilePic.slice(0, 100)}...`);
      }
    }
  } catch (err) {
    logInfo(`   [FB] profile pic error: ${err?.message || err}`);
  }

  // 2) Cover photo: needs Page Public Content Access; skipped quietly when denied.
  try {
    const res = await fetchWithTimeout(
      `${graphNode}?fields=cover{source}&access_token=${accessToken}`,
      { redirect: "follow" },
      FETCH_TIMEOUT_MS
    );
    if (!res.ok) {
      logDebug(`   [FB] cover endpoint returned ${res.status} (needs Page Public Content Access review)`);
    } else {
      const payload = await res.json();
      images.cover = payload.cover?.source || null;
      if (images.cover) {
        logInfo(`   [FB] cover photo: ${images.cover.slice(0, 100)}...`);
      }
    }
  } catch {
    // silently ignore: permission not granted yet
  }

  if (!images.profilePic && !images.cover) {
    logInfo(`   [FB] no images retrieved for ${facebookPageUrl}`);
  }
  return images;
}

/**
 * Fallback image recovery: gather candidate images from an image search, the
 * business's social pages and search results, then pick a cover and gallery.
 *
 * Steps, in order:
 *   1. DuckDuckGo image search for `"name" "city"` (first 30 results).
 *   2. Queue the known social URLs, plus derived /photos and /posts pages for Facebook.
 *   3. Queue results of a web search restricted to the social sites (and the
 *      website's own domain when known).
 *   4. Visit the 10 highest-priority queued pages: Facebook pages go through
 *      `fetchFacebookImages` first, everything else (and Facebook when that
 *      returns nothing) is fetched as HTML and scanned for images.
 *   5. Sort candidates by `semantic + bonus` and hand them to `pickTopImages`.
 *
 * @param {string} name - Business name.
 * @param {string} city - City.
 * @param {string} websiteUrl - Business website; its domain limits which
 *   non-social pages are accepted.
 * @param {{ socialUrls?: string[] }} [options] - Already-known social profile URLs.
 * @returns {Promise<{ cover: *, gallery: *, candidates: Array<{ url: string, bonus: number, semantic: number }> }>}
 *   `cover` and `gallery` are whatever `pickTopImages` returns; when
 *   RECOVER_IMAGES is falsy the result is `{ cover: null, gallery: [], candidates: [] }`.
 */
async function tryRecoverImages(name, city, websiteUrl, { socialUrls = [] } = {}) {
  if (!RECOVER_IMAGES) return { cover: null, gallery: [], candidates: [] };

  console.log("   Fallback images via search...");
  // Candidate images keyed by canonical URL key; bonus/semantic accumulate across sightings.
  const recovered = new Map();
  const websiteDomain = domainOf(websiteUrl || "");

  // Map a URL's host to a known social platform name, or null.
  const socialPlatformFromUrl = (u) => {
    const host = domainOf(u);
    if (!host) return null;
    if (host.endsWith("facebook.com") || host.endsWith("fb.com") || host === "fb.me") return "facebook";
    if (host.endsWith("instagram.com")) return "instagram";
    if (host.endsWith("linkedin.com") || host.endsWith("linkedin.be")) return "linkedin";
    if (host.endsWith("tiktok.com")) return "tiktok";
    return null;
  };

  // True for social URLs that are not a profile/page/post: share and legal
  // links, platform root pages, and per-platform sections listed below.
  // Unparsable URLs are also treated as ignored.
  const isIgnoredSocialImagePageUrl = (rawUrl) => {
    const u = safeUrl(rawUrl);
    if (!u) return true;
    try {
      const parsed = new URL(u);
      const host = parsed.hostname.toLowerCase();
      const path = String(parsed.pathname || "/").toLowerCase();
      const segs = path.split("/").filter(Boolean).map((x) => x.toLowerCase());
      const rootOrEmpty = segs.length === 0;

      if (/\/(share|sharer|plugins|intent)\b/.test(path)) return true;
      if (/\/(privacy|policies|policy|help|terms|legal|login|checkpoint)\b/.test(path)) return true;

      if (host.endsWith("facebook.com") || host.endsWith("fb.com") || host === "fb.me") {
        const blocked = new Set(["pages", "watch", "groups", "events", "marketplace", "gaming", "search", "hashtag", "about", "business"]);
        if (rootOrEmpty || blocked.has(segs[0])) return true;
      }
      if (host.endsWith("instagram.com")) {
        const blocked = new Set(["explore", "accounts", "stories", "about", "developer", "legal"]);
        if (rootOrEmpty || blocked.has(segs[0])) return true;
      }
      if (host.endsWith("linkedin.com") || host.endsWith("linkedin.be")) {
        const blocked = new Set(["feed", "jobs", "learning", "help", "signup", "authwall", "news", "pulse"]);
        if (rootOrEmpty || blocked.has(segs[0])) return true;
      }
      if (host.endsWith("tiktok.com")) {
        const head = segs[0] || "";
        const blocked = new Set(["discover", "tag", "foryou", "login", "about"]);
        if (rootOrEmpty || blocked.has(head)) return true;
        // Only "@handle" paths are accepted on TikTok.
        if (!head.startsWith("@")) return true;
      }
      return false;
    } catch {
      return true;
    }
  };

  // Priority score of a page: social pages and the business's own domain rank
  // higher, media-like paths get a boost, contact/legal-like paths a penalty.
  const pagePriority = (rawUrl) => {
    const u = safeUrl(rawUrl);
    if (!u) return -100;
    const platform = socialPlatformFromUrl(u);
    let score = platform ? 6 : 0;
    const host = domainOf(u);
    if (websiteDomain && host === websiteDomain) score += 4;
    try {
      const p = normalizeBceForTokenMatch(new URL(u).pathname || "");
      if (/\b(p|reel|reels|post|posts|photo|photos|gallery|portfolio|projets?)\b/.test(p)) score += 5;
      if (/\b(contact|about|mentions|privacy|legal|terms)\b/.test(p)) score -= 6;
    } catch {
      // ignore
    }
    return score;
  };

  // Register one candidate image. Bad/logo-like URLs are skipped; repeated
  // sightings of the same canonical key add their bonus and semantic values,
  // and the most recently seen URL form is kept.
  const addRecovered = (rawUrl, { baseUrl = null, bonus = 0, semantic = 0 } = {}) => {
    const abs = normalizePublicUrl(rawUrl, baseUrl);
    if (!abs || isBadImageUrl(abs) || isLikelyLogoUrl(abs)) return;
    const key = canonicalImageUrlKey(abs) || abs;
    const prev = recovered.get(key) || { url: abs, bonus: 0, semantic: 0 };
    prev.url = abs;
    prev.bonus += Number(bonus || 0);
    prev.semantic += Number(semantic || 0);
    recovered.set(key, prev);
  };

  // Walk a JSON-LD value (up to 8 levels deep) and push every image-like URL
  // found under the keys image / thumbnailUrl / contentUrl (case-insensitive).
  const collectJsonLdImages = (value, push, depth = 0) => {
    if (depth > 8 || value === null || value === undefined) return;
    if (typeof value === "string") return;
    if (Array.isArray(value)) {
      value.forEach((x) => collectJsonLdImages(x, push, depth + 1));
      return;
    }
    if (typeof value !== "object") return;

    for (const [k, v] of Object.entries(value)) {
      const key = String(k || "").toLowerCase();
      const imageLike = key === "image" || key === "thumbnailurl" || key === "contenturl";
      if (imageLike) {
        if (typeof v === "string") {
          push(v);
        } else if (Array.isArray(v)) {
          for (const item of v) {
            if (typeof item === "string") push(item);
            else if (item && typeof item === "object") {
              if (typeof item.url === "string") push(item.url);
              if (typeof item.contentUrl === "string") push(item.contentUrl);
            }
          }
        } else if (v && typeof v === "object") {
          if (typeof v.url === "string") push(v.url);
          if (typeof v.contentUrl === "string") push(v.contentUrl);
        }
      }
      collectJsonLdImages(v, push, depth + 1);
    }
  };

  // Scan one fetched page for images: Open Graph / Twitter meta tags, JSON-LD,
  // <img>, <source srcset> and links to image files. Each source gets its own
  // bonus/semantic weighting on top of the page's base bonus.
  const extractFromPageHtml = (html, pageUrl) => {
    const $ = cheerio.load(html);
    const pagePlatform = socialPlatformFromUrl(pageUrl);
    const fromSocial = Boolean(pagePlatform);
    const baseBonus = pagePriority(pageUrl) + (fromSocial ? 4 : 1);
    let pagePathNorm = "";
    try {
      pagePathNorm = normalizeBceForTokenMatch(new URL(pageUrl).pathname || "");
    } catch {
      pagePathNorm = "";
    }
    // Facebook URL whose path has no posts/photos/reels/videos token.
    const fbProfileLikeRoot = pagePlatform === "facebook" && !/\b(posts?|photos?|reels?|videos?)\b/.test(pagePathNorm);

    $('meta[property="og:image"], meta[property="og:image:secure_url"]').each((_, el) => {
      let bonus = baseBonus + 12;
      let semantic = 4;
      if (pagePlatform === "facebook") {
        // Prefer Facebook page cover/banner when present.
        bonus += fbProfileLikeRoot ? 48 : 24;
        semantic += 4;
      }
      addRecovered($(el).attr("content"), { baseUrl: pageUrl, bonus, semantic });
    });
    $('meta[name="twitter:image"], meta[name="twitter:image:src"], meta[itemprop="image"], link[rel="image_src"]').each((_, el) => {
      let bonus = baseBonus + 8;
      let semantic = 2;
      if (pagePlatform === "facebook" && fbProfileLikeRoot) {
        bonus += 18;
        semantic += 2;
      }
      addRecovered($(el).attr("content") || $(el).attr("href"), { baseUrl: pageUrl, bonus, semantic });
    });

    try {
      const flat = flattenJsonLd(extractJsonLd($));
      for (const obj of flat) {
        collectJsonLdImages(obj, (u) => addRecovered(u, { baseUrl: pageUrl, bonus: baseBonus + 7, semantic: 2 }));
      }
    } catch {
      // ignore
    }

    $("img").each((_, el) => {
      // First available source attribute wins (including common lazy-load attributes).
      const src =
        $(el).attr("src") ||
        $(el).attr("data-src") ||
        $(el).attr("data-lazy-src") ||
        $(el).attr("data-original") ||
        pickFromSrcset($(el).attr("srcset"));
      if (!src) return;

      const alt = String($(el).attr("alt") || "");
      const title = String($(el).attr("title") || "");
      const cls = String($(el).attr("class") || "");
      const id = String($(el).attr("id") || "");
      const contextHint = [
        $(el).closest("section,article,main,header,footer,nav").attr("class") || "",
        $(el).closest("section,article,main,header,footer,nav").attr("id") || "",
        $(el).parent().attr("class") || "",
      ].join(" ");
      let semantic = imageSemanticScore({ src, alt, title, cls, id, pageUrl, contextHint });

      // Placement-based adjustments: content areas up, page chrome down.
      let bonus = baseBonus + 1;
      if ($(el).closest("main,article,.content,.entry,.portfolio,.projects,.gallery,.feed,.post").length) bonus += 4;
      if ($(el).closest(".hero,.banner,[class*='hero'],[id*='hero']").length) bonus += 2;
      if ($(el).closest("header,nav,footer,.site-header,.navbar,.menu").length) bonus -= 6;
      if (pagePlatform === "facebook") {
        const srcNorm = normalizeBceForTokenMatch(src);
        if (/\b(cover|banner|timeline)\b/.test(srcNorm)) {
          bonus += 26;
          semantic += 4;
        } else if (fbProfileLikeRoot) {
          bonus += 10;
        }
      }

      addRecovered(src, { baseUrl: pageUrl, bonus, semantic });
    });

    $("source[srcset]").each((_, el) => {
      const picked = pickFromSrcset($(el).attr("srcset"));
      if (picked) addRecovered(picked, { baseUrl: pageUrl, bonus: baseBonus + 1, semantic: 1 });
    });

    // Anchors pointing straight at an image file.
    $("a[href]").each((_, el) => {
      const href = $(el).attr("href");
      if (!href) return;
      if (!/\.(?:jpe?g|png|webp|gif)(?:[?#].*)?$/i.test(href)) return;
      addRecovered(href, { baseUrl: pageUrl, bonus: baseBonus + 2, semantic: 1 });
    });
  };

  // Pages queued for fetching: URL -> best score seen for that URL.
  const pagesToFetch = new Map();
  // Queue a page. Social pages must pass isIgnoredSocialImagePageUrl; other
  // pages must be on the website's domain (when that domain is known).
  const pushPage = (rawUrl, bonus = 0) => {
    const u = normalizePublicUrl(rawUrl);
    if (!u) return;
    const platform = socialPlatformFromUrl(u);
    if (platform) {
      if (isIgnoredSocialImagePageUrl(u)) return;
    } else if (websiteDomain && domainOf(u) !== websiteDomain) {
      return;
    }
    const score = pagePriority(u) + Number(bonus || 0);
    const prev = pagesToFetch.get(u);
    if (prev === undefined || score > prev) pagesToFetch.set(u, score);
  };

  // 1) Attempt via image search if available in the lib version.
  try {
    const q = `"${name}" "${city}"`;
    const imgRes = await ddgImageSearchRaw(q);
    const imgs = (imgRes?.results || []).map((x) => x.image || x.thumbnail || x.url).filter(Boolean);
    imgs.slice(0, 30).forEach((u) => addRecovered(u, { bonus: 2, semantic: 0 }));
  } catch {
    // ignore
  }

  // 2) Build social pages list from known social URLs (site/BCE).
  for (const raw of socialUrls || []) {
    const u = normalizePublicUrl(raw);
    if (!u) continue;
    pushPage(u, 8);
    try {
      const parsed = new URL(u);
      const host = parsed.hostname;
      const segs = String(parsed.pathname || "/")
        .split("/")
        .filter(Boolean);
      const platform = socialPlatformFromUrl(u);
      if (platform === "facebook" && segs.length >= 1 && !/^profile\.php$/i.test(segs[0])) {
        pushPage(`https://${host}/${segs[0]}/photos`, 7);
        pushPage(`https://${host}/${segs[0]}/posts`, 6);
      // NOTE(review): Instagram profile paths normally have no "@" prefix, so this
      // branch may never match for Instagram; confirm the intended condition.
      } else if (platform === "instagram" && segs[0] && segs[0].startsWith("@")) {
        pushPage(`https://${host}/${segs[0]}/`, 7);
      } else if (platform === "tiktok" && segs[0] && segs[0].startsWith("@")) {
        pushPage(`https://${host}/${segs[0]}`, 7);
      }
    } catch {
      // ignore
    }
  }

  // 3) Search likely social pages/posts and official pages.
  try {
    const domainPart = websiteDomain ? ` OR site:${websiteDomain}` : "";
    const q = `"${name}" "${city}" (site:facebook.com OR site:instagram.com OR site:linkedin.com OR site:tiktok.com${domainPart})`;
    const res = await searchRawResults(q, { count: 12 });
    for (const r of res || []) {
      pushPage(r.url, 2);
    }
  } catch {
    // ignore
  }

  // Visit only the 10 best-scored pages, best first.
  const pageList = Array.from(pagesToFetch.entries())
    .sort((a, b) => b[1] - a[1])
    .map(([u]) => u)
    .slice(0, 10);

  for (const pageUrl of pageList) {
    const platform = socialPlatformFromUrl(pageUrl);

    // Facebook: prefer Graph API (reliable) over HTML fetch (usually blocked)
    // NOTE(review): every Facebook URL in pageList triggers its own Graph API
    // call; several URLs of the same page (e.g. /slug, /slug/photos, /slug/posts)
    // presumably repeat the request and stack the bonus on the same image.
    // Confirm whether that is intended.
    if (platform === "facebook") {
      const { profilePic, cover } = await fetchFacebookImages(pageUrl);
      if (cover) addRecovered(cover, { baseUrl: pageUrl, bonus: 70, semantic: 10 });
      if (profilePic) addRecovered(profilePic, { baseUrl: pageUrl, bonus: 50, semantic: 6 });
      if (cover || profilePic) continue; // Graph API succeeded, skip HTML fetch
      // Graph API unavailable/failed → fall through to HTML fetch
    }

    // Social pages are requested with the browser user agent, other pages with the default one.
    const isSocial = Boolean(platform);
    const ua = isSocial ? BROWSER_USER_AGENT : USER_AGENT;
    try {
      const html = await fetchHtml(pageUrl, { userAgent: ua });
      if (!html) {
        logInfo(`   [Recovery] fetch failed for ${pageUrl} (empty/blocked)`);
        continue;
      }
      logInfo(`   [Recovery] fetched ${platform || "page"}: ${pageUrl} (${(html.length / 1024).toFixed(0)} KB)`);
      extractFromPageHtml(html, pageUrl);
    } catch (err) {
      logInfo(`   [Recovery] error fetching ${pageUrl}: ${err?.message || err}`);
    }
  }

  // Final candidate list, highest combined (semantic + bonus) first.
  const list = Array.from(recovered.values())
    .map((x) => ({ url: x.url, bonus: x.bonus, semantic: x.semantic }))
    .sort((a, b) => Number(b.semantic || 0) + Number(b.bonus || 0) - (Number(a.semantic || 0) + Number(a.bonus || 0)));

  logInfo(`   [Recovery] total recovered images: ${list.length} (from ${pageList.length} pages fetched)`);
  const { cover, gallery } = await pickTopImages(list, null);
  return { cover, gallery, candidates: list };
}

/**
 * Collects candidate logo image URLs from a company's social profile pages.
 *
 * Each input URL is normalized with normalizeSocialProfileUrl, de-duplicated,
 * and at most 5 profiles are inspected. For Facebook profiles the profile
 * picture returned by fetchFacebookImages is tried first; otherwise the page
 * HTML is fetched and image URLs are read from meta tags and the first 50
 * <img> elements. Every URL keeps the highest score it was seen with.
 *
 * @param {string[]} [socialUrls] Social profile URLs (falsy entries are ignored).
 * @returns {Promise<Array<{url: string, score: number}>>} Up to 10 candidates,
 *   highest score first.
 */
async function recoverSocialLogoCandidates(socialUrls = []) {
  const normalizedProfiles = (socialUrls || []).map((u) => normalizeSocialProfileUrl(u)).filter(Boolean);
  const pages = [...new Set(normalizedProfiles)].slice(0, 5);
  if (pages.length === 0) return [];

  const bestScoreByUrl = new Map();

  // Registers a candidate, keeping only the best score seen for a given absolute URL.
  const add = (rawUrl, { baseUrl = null, score = 0 } = {}) => {
    const absolute = normalizePublicUrl(rawUrl, baseUrl);
    if (!absolute) return;
    if (isBadLogoUrl(absolute)) return;

    const tokens = normalizeBceForTokenMatch(absolute);
    let total = Number(score || 0);
    if (/\b(profile|avatar|logo|brand|photo)\b/.test(tokens)) total += 14;
    if (/\b(scontent|cdninstagram|licdn|tiktokcdn|fbcdn)\b/.test(tokens)) total += 6;

    const previous = bestScoreByUrl.get(absolute) || Number.NEGATIVE_INFINITY;
    if (total > previous) bestScoreByUrl.set(absolute, total);
  };

  // Maps a profile URL to a platform label from its host; null when unknown or unparsable.
  const detectPlatform = (profileUrl) => {
    try {
      const host = domainOf(profileUrl);
      if (host.endsWith("facebook.com") || host.endsWith("fb.com") || host === "fb.me") return "facebook";
      if (host.endsWith("instagram.com")) return "instagram";
      if (host.endsWith("linkedin.com") || host.endsWith("linkedin.be")) return "linkedin";
      if (host.endsWith("tiktok.com")) return "tiktok";
      return null;
    } catch {
      return null;
    }
  };

  const baseScoreByPlatform = { facebook: 36, instagram: 32, linkedin: 30, tiktok: 28 };

  // [selector, attribute, score added on top of the platform base], in evaluation order.
  const metaSources = [
    ['meta[property="og:image"]', "content", 40],
    ['meta[property="og:image:secure_url"]', "content", 40],
    ['meta[name="twitter:image"]', "content", 24],
    ['meta[name="twitter:image:src"]', "content", 24],
    ['meta[itemprop="image"]', "content", 18],
    ['link[rel="image_src"]', "href", 18],
  ];

  for (const pageUrl of pages) {
    const platform = detectPlatform(pageUrl);
    const platformBase = baseScoreByPlatform[platform] ?? 20;

    // Facebook: try Graph API first for profile picture (logo)
    if (platform === "facebook") {
      const { profilePic } = await fetchFacebookImages(pageUrl);
      if (profilePic) {
        add(profilePic, { baseUrl: pageUrl, score: platformBase + 50 });
        logInfo(`   [LogoRecovery] got Facebook profile pic via Graph API`);
        continue;
      }
    }

    const html = await fetchHtml(pageUrl, { userAgent: BROWSER_USER_AGENT });
    if (!html) {
      logInfo(`   [LogoRecovery] fetch failed for ${platform || "social"}: ${pageUrl}`);
      continue;
    }
    const $ = cheerio.load(html);

    for (const [selector, attribute, extra] of metaSources) {
      add($(selector).attr(attribute), { baseUrl: pageUrl, score: platformBase + extra });
    }

    $("img")
      .slice(0, 50)
      .each((_, el) => {
        const node = $(el);
        const src =
          node.attr("src") ||
          node.attr("data-src") ||
          node.attr("data-lazy-src") ||
          pickFromSrcset(node.attr("srcset"));
        if (!src) return;

        const hint = normalizeBceForTokenMatch(
          `${src} ${String(node.attr("alt") || "")} ${String(node.attr("class") || "")} ${String(node.attr("id") || "")}`
        );
        let score = platformBase + 6;
        if (/\b(profile|avatar|logo|brand)\b/.test(hint)) score += 24;
        if (/\b(cover|banner)\b/.test(hint)) score += 6;
        add(src, { baseUrl: pageUrl, score });
      });
  }

  const ranked = Array.from(bestScoreByUrl.entries());
  ranked.sort((a, b) => b[1] - a[1]);
  return ranked.slice(0, 10).map(([url, score]) => ({ url, score }));
}

/**
 * Best-effort founder lookup based on search-result titles and snippets.
 *
 * Disabled (returns null) unless RECOVER_FOUNDER_EXTERNAL is truthy. Runs a
 * handful of search queries, feeds each unseen result's title + snippet to
 * extractFounderFromText, and keeps the highest-scoring hit. Results that
 * point to LinkedIn or to the company's own domain get a score bonus.
 *
 * @param {object} args
 * @param {string} args.companyName Company name used in the queries.
 * @param {string} args.city City used in the queries.
 * @param {string} [args.websiteUrl] Company website; its domain adds a site: query.
 * @param {string} [args.linkedinUrl] Known LinkedIn URL; adds a dedicated query.
 * @returns {Promise<{founder_name: string, founder_role: string, founder_photo_url: null}|null>}
 */
async function tryRecoverFounderFromSnippets({ companyName, city, websiteUrl, linkedinUrl }) {
  if (!RECOVER_FOUNDER_EXTERNAL) return null;
  console.log("   Fallback founder via external snippets...");

  const domain = domainOf(websiteUrl || "");

  const queries = [];
  queries.push(`"${companyName}" "${city}" (fondateur OR gerant OR CEO OR founder)`);
  if (domain) queries.push(`site:${domain} "${companyName}" (fondateur OR gerant OR CEO)`);
  queries.push(`"${companyName}" "${city}" site:linkedin.com/in`);
  if (linkedinUrl) queries.push(`"${linkedinUrl}" "${companyName}"`);

  // A failing search is treated as "no results" so the remaining queries still run.
  const searchOrEmpty = async (query) => {
    try {
      return await searchRawResults(query, { count: 8 });
    } catch {
      return [];
    }
  };

  // Adds source-based bonuses on top of the text-extraction score.
  const rankHit = (resultUrl, baseScore) => {
    let total = Number(baseScore || 0);
    if (linkedinUrl && resultUrl === linkedinUrl) total += 3;
    if (/linkedin\.com\/in\//i.test(resultUrl)) {
      total += 3;
    } else if (/linkedin\.com/i.test(resultUrl)) {
      total += 2;
    }
    if (domain && domainOf(resultUrl) === domain) total += 2;
    return total;
  };

  const seenUrls = new Set();
  let best = null;

  for (const query of queries) {
    const rows = await searchOrEmpty(query);

    for (const row of rows) {
      const resultUrl = row?.url || null;
      if (!resultUrl || seenUrls.has(resultUrl)) continue;
      seenUrls.add(resultUrl);

      const parsed = extractFounderFromText(`${row?.title || ""} ${row?.snippet || ""}`, {
        companyNameHint: companyName,
        urlHint: resultUrl,
      });
      if (!parsed?.founder_name) continue;

      const score = rankHit(resultUrl, parsed.score);
      const isBetter = !best || score > best.score;
      if (isBetter) {
        best = {
          founder_name: parsed.founder_name,
          founder_role: parsed.founder_role || "Fondateur",
          founder_photo_url: null,
          score,
        };
      }
    }
  }

  if (!best) return null;
  const { founder_name, founder_role, founder_photo_url } = best;
  return { founder_name, founder_role, founder_photo_url };
}

// -------------------- CRAWL --------------------
/**
 * Fetches a URL and returns its body only when the response is an HTML document.
 *
 * Never throws: network errors, timeouts, non-2xx statuses and non-HTML
 * content types all yield null so callers can treat "no page" uniformly.
 *
 * @param {string} url URL to fetch (redirects are followed).
 * @param {object} [options]
 * @param {string} [options.userAgent=USER_AGENT] Value for the User-Agent header.
 * @returns {Promise<string|null>} The response text, or null when unavailable.
 */
async function fetchHtml(url, { userAgent = USER_AGENT } = {}) {
  try {
    const res = await fetchWithTimeout(
      url,
      {
        redirect: "follow",
        headers: {
          "User-Agent": userAgent,
          Accept: "text/html,application/xhtml+xml",
        },
      },
      FETCH_TIMEOUT_MS
    );
    if (!res.ok) return null;

    // Media types are case-insensitive, so "Text/HTML; charset=UTF-8" must be accepted too.
    const contentType = String(res.headers.get("content-type") || "").toLowerCase();
    const isHtml = contentType.includes("text/html") || contentType.includes("application/xhtml+xml");
    if (!isHtml) return null;

    return await res.text();
  } catch {
    // Best-effort fetch: any failure is reported as "no HTML".
    return null;
  }
}

/**
 * Parses every <script type="application/ld+json"> element of a document.
 *
 * @param {Function} $ cheerio-style selector function for the loaded page.
 * @returns {Array<*>} Parsed JSON values in document order; empty or invalid
 *   scripts are skipped.
 */
function extractJsonLd($) {
  const parsedDocs = [];

  const collect = (_, scriptEl) => {
    const source = $(scriptEl).contents().text();
    if (source) {
      try {
        parsedDocs.push(JSON.parse(source));
      } catch {
        // Malformed JSON-LD is common in the wild; skip it.
      }
    }
  };

  $('script[type="application/ld+json"]').each(collect);
  return parsedDocs;
}

/**
 * Flattens a parsed JSON-LD value into a list of plain node objects.
 *
 * Arrays are expanded, and an object carrying a truthy "@graph" is replaced by
 * the contents of that graph (the wrapper object itself is not emitted).
 * Primitives and falsy values are ignored. Siblings are emitted last-to-first,
 * i.e. in the reverse of their array order.
 *
 * @param {*} value Parsed JSON-LD (object, array, or anything else).
 * @returns {object[]} The collected node objects.
 */
function flattenJsonLd(value) {
  const nodes = [];

  const visit = (item) => {
    if (!item || typeof item !== "object") return;

    if (Array.isArray(item)) {
      // Walk backwards so the emission order matches a LIFO traversal.
      for (let i = item.length - 1; i >= 0; i -= 1) visit(item[i]);
      return;
    }

    const graph = item["@graph"];
    if (graph) {
      visit(graph);
      return;
    }

    nodes.push(item);
  };

  visit(value);
  return nodes;
}

/**
 * Selects the most complete business-like node from parsed JSON-LD documents.
 *
 * Documents are flattened with flattenJsonLd; only nodes whose "@type" (string
 * or array) contains one of the accepted business types are considered. Each
 * candidate is scored on the presence of name, email, logo, image and address
 * parts; the highest score wins and the earliest candidate wins ties.
 *
 * @param {Array<*>} jsonlds Parsed JSON-LD documents (may be null/undefined).
 * @returns {object|null} The best candidate node, or null when none qualifies.
 */
function pickBestStructuredBusiness(jsonlds) {
  const businessTypes = new Set([
    "LocalBusiness",
    "Organization",
    "ChildCare",
    "ProfessionalService",
    "MedicalOrganization",
    "HealthAndBeautyBusiness",
  ]);

  const isBusinessNode = (node) => {
    const declared = node?.["@type"];
    if (!declared) return false;
    return Array.isArray(declared)
      ? declared.some((typeName) => businessTypes.has(typeName))
      : businessTypes.has(declared);
  };

  // Completeness score: [weight, present?] pairs summed up.
  const completeness = (biz) => {
    const addr = biz?.address;
    const checks = [
      [2, biz?.name],
      [2, biz?.email],
      [1, biz?.logo],
      [1, biz?.image],
      [1, addr],
      [2, addr?.streetAddress],
      [1, addr?.postalCode],
      [1, addr?.addressLocality],
    ];
    return checks.reduce((sum, [weight, present]) => (present ? sum + weight : sum), 0);
  };

  const candidates = (jsonlds || []).flatMap((doc) => flattenJsonLd(doc)).filter(isBusinessNode);
  if (candidates.length === 0) return null;

  let winner = candidates[0];
  let winnerScore = completeness(winner);
  for (const candidate of candidates.slice(1)) {
    const score = completeness(candidate);
    // Strict comparison keeps the earliest candidate on ties.
    if (score > winnerScore) {
      winner = candidate;
      winnerScore = score;
    }
  }
  return winner;
}

/**
 * Builds a single-line postal address from a JSON-LD business node.
 *
 * A string `address` is returned whitespace-normalized. For a structured
 * address the parts are joined as "street, postal locality, region, country".
 * The postal code and locality are each kept even when the other is missing,
 * and object-valued parts (e.g. a Country node) contribute their `name`
 * instead of being stringified as "[object Object]".
 *
 * @param {object} biz JSON-LD node that may carry an `address`.
 * @returns {string|null} The formatted address, or null when nothing usable exists.
 */
function extractAddressFromJsonLd(biz) {
  const addr = biz?.address;
  if (!addr) return null;
  if (typeof addr === "string") return normSpaces(addr);

  // Reduces one address part to trimmed text; nested nodes are represented by their name.
  const partText = (part) => {
    if (part === null || part === undefined) return null;
    if (typeof part === "object") {
      return typeof part.name === "string" && part.name.trim() ? part.name.trim() : null;
    }
    const text = String(part).trim();
    return text || null;
  };

  const postalAndLocality = [partText(addr.postalCode), partText(addr.addressLocality)]
    .filter(Boolean)
    .join(" ");

  const parts = [
    partText(addr.streetAddress),
    postalAndLocality || null,
    partText(addr.addressRegion),
    partText(addr.addressCountry),
  ].filter(Boolean);

  return parts.length ? normSpaces(parts.join(", ")) : null;
}

/**
 * Reads the postal code of a JSON-LD business node.
 *
 * @param {object} biz JSON-LD node that may carry `address.postalCode`.
 * @returns {string|null} The trimmed postal code as a string, or null when absent.
 */
function extractPostalFromJsonLd(biz) {
  const postalCode = biz?.address?.postalCode;
  if (!postalCode) return null;
  return String(postalCode).trim();
}

/**
 * Reads the locality of a JSON-LD business node and passes it through
 * sanitizeCityValue.
 *
 * @param {object} biz JSON-LD node that may carry `address.addressLocality`.
 * @returns {*} The result of sanitizeCityValue on the trimmed locality, or
 *   null when no locality is present.
 */
function extractCityFromJsonLd(biz) {
  const locality = biz?.address?.addressLocality;
  if (!locality) return null;
  const trimmed = String(locality).trim();
  return sanitizeCityValue(trimmed);
}

/**
 * Guesses the site/company name from page markup.
 *
 * Sources are tried in order: the og:site_name meta tag, the application-name
 * meta tag, then the first <h1>. The first non-empty value that
 * looksLikeNoiseCompanyName does not reject is returned whitespace-normalized.
 *
 * @param {Function} $ cheerio-style selector function for the loaded page.
 * @returns {string|null} The detected name, or null when no source qualifies.
 */
function extractNameFromHtml($) {
  // Thunks keep each lookup lazy: later sources are only read when earlier ones fail.
  const sources = [
    () => $('meta[property="og:site_name"]').attr("content"),
    () => $('meta[name="application-name"]').attr("content"),
    () => $("h1").first().text(),
  ];

  for (const read of sources) {
    const candidate = read();
    if (candidate && !looksLikeNoiseCompanyName(candidate)) {
      return normSpaces(candidate);
    }
  }
  return null;
}

/**
 * Returns the visible text of a page as a single whitespace-normalized string.
 *
 * Note: this mutates the document — script, noscript, style and svg elements
 * are removed from `$` before the body text is read.
 *
 * @param {Function} $ cheerio-style selector function for the loaded page.
 * @returns {string} Normalized text content of <body>.
 */
function extractText($) {
  const nonContentSelector = "script,noscript,style,svg";
  $(nonContentSelector).remove();

  const bodyText = $("body").text();
  return normSpaces(bodyText);
}

/**
 * Escapes regular-expression metacharacters so the input can be embedded
 * literally inside a RegExp source.
 *
 * @param {*} s Value to escape; falsy values are treated as an empty string.
 * @returns {string} The escaped text.
 */
function escapeRegex(s) {
  const text = String(s || "");
  return text.replace(/[.*+?^${}()|[\]\\]/g, (metaChar) => `\\${metaChar}`);
}

/**
 * Checks whether a text mentions a city, comparing both after
 * normalizeCityName so the match follows that helper's normalization.
 *
 * @param {string} text Text to search in.
 * @param {string} city City name to look for.
 * @returns {boolean} True when the normalized city is a substring of the
 *   normalized text; false when either input is missing or normalizes to nothing.
 */
function textLikelyContainsCity(text, city) {
  if (text && city) {
    const haystack = normalizeCityName(text);
    const needle = normalizeCityName(city);
    return Boolean(haystack && needle) && haystack.includes(needle);
  }
  return false;
}

/**
 * Heuristic test for a French-language street address: the text must contain
 * both a street-type word (rue, avenue, chaussée, ...) and a house-number-like
 * token (1-4 digits, optionally followed by a letter).
 *
 * @param {*} s Candidate text (coerced to a lower-cased string).
 * @returns {boolean} True when both a street word and a number are present.
 */
function looksLikeStreetAddress(s) {
  const t = String(s || "").toLowerCase();
  if (!t) return false;

  // The abbreviation "av." is matched separately: a trailing \b after the dot
  // never matches when a space follows, which made "av. louise 12" fail.
  const hasStreetWord =
    /\b(?:rue|avenue|boulevard|bd|chauss[eé]e|chemin|place|quai|route|impasse|all[ée]e|square)\b|\bav\./.test(t);
  const hasNumber = /\b\d{1,4}\s*[a-z]?\b/.test(t);
  return hasStreetWord && hasNumber;
}

/**
 * Finds "postal code + city" pairs such as "4000 Liège" in free text.
 *
 * Up to 30 candidates are collected in text order; candidates whose city is
 * rejected by sanitizeCityValue are dropped. When `cityHint` is given, the
 * first candidate that sameCity matches against the hint is preferred;
 * otherwise the first candidate is returned.
 *
 * @param {*} text Text to scan (coerced to a string).
 * @param {string|null} cityHint Optional city used to pick among candidates.
 * @returns {{postal_code: string, city: *, index: number}|null} The chosen
 *   candidate (index = match offset in the text), or null when none is found.
 */
function extractBePostalCityFromText(text, cityHint) {
  const haystack = String(text || "");
  if (!haystack) return null;

  // Belgium postal codes are 4 digits; extract "4000 Liège" like patterns.
  // The city group stops at digits, 2+ spaces, comma, or other punctuation to avoid over-capture.
  const pattern = /\b([1-9]\d{3})\s+(\p{L}[\p{L}'\-]{0,25}(?:\s[\p{L}'\-]{1,25}){0,3})(?=[\s,;:.\d]|$)/gu;
  const maxCandidates = 30;

  const found = [];
  for (const match of haystack.matchAll(pattern)) {
    const [, postal_code, rawCity] = match;
    const city = sanitizeCityValue(normSpaces(rawCity).replace(/[.,;:]+$/, ""));
    if (!city) continue;
    found.push({ postal_code, city, index: match.index });
    if (found.length >= maxCandidates) break;
  }

  if (found.length === 0) return null;

  const preferred = cityHint ? found.find((candidate) => sameCity(candidate.city, cityHint)) : undefined;
  return preferred || found[0];
}

/**
 * Plausibility check for a Belgian postal code in the context of the
 * configured MARKET.
 *
 * The code must parse to an integer in 1000-9999. When MARKET is "BE-WAL"
 * (case-insensitive), values in 2000-3999 are additionally rejected: this
 * range collides with year-like numbers ("2024 ...") and holds Flemish
 * postcodes.
 *
 * @param {string|number} postal_code Candidate postal code.
 * @returns {boolean} True when the code is acceptable for the current market.
 */
function isLikelyBePostalCodeForMarket(postal_code) {
  if (!postal_code) return false;

  const code = parseInt(String(postal_code), 10);
  const inBelgianRange = Number.isFinite(code) && code >= 1000 && code <= 9999;
  if (!inBelgianRange) return false;

  // Heuristic: for BE-WAL runs, avoid common false positives like "2024 ..." (years)
  // and reduce Flanders-only postcodes when scraping Wallonia.
  const isWalloniaRun = String(MARKET || "").toUpperCase() === "BE-WAL";
  const inExcludedBand = code >= 2000 && code < 4000;
  return !(isWalloniaRun && inExcludedBand);
}

/**
 * Maps a Belgian postal code to a province label (or "Bruxelles" for the
 * Brussels-Capital Region) using postal-code ranges.
 *
 * @param {string|number} postal_code Postal code to classify.
 * @returns {string|null} The province label, or null when the value is not a
 *   number in 1000-9999.
 */
function beProvinceFromPostalCode(postal_code) {
  const code = parseInt(String(postal_code || ""), 10);
  if (!Number.isFinite(code) || code < 1000 || code > 9999) return null;

  // Regions / provinces by postal code range (good enough for filtering "base location").
  // Each row is [first code, last code, label]; ranges do not overlap.
  const ranges = [
    [1000, 1299, "Bruxelles"], // Brussels-Capital Region
    [1300, 1499, "Brabant wallon"], // Walloon Brabant (avoid confusing with Brussels 1xxx)
    [1500, 1999, "Brabant flamand"], // Flemish Brabant, 15xx-19xx
    [2000, 2999, "Anvers"], // Antwerp
    [3000, 3499, "Brabant flamand"], // Flemish Brabant, 30xx-34xx
    [3500, 3999, "Limbourg"], // Limburg
    [4000, 4999, "Liege"],
    [5000, 5999, "Namur"],
    [6000, 6599, "Hainaut"],
    [6600, 6999, "Luxembourg"],
    [7000, 7999, "Hainaut"],
    [8000, 8999, "Flandre occidentale"],
    [9000, 9999, "Flandre orientale"],
  ];

  const hit = ranges.find(([first, last]) => code >= first && code <= last);
  return hit ? hit[2] : null;
}

/**
 * Maps a province label to a Belgian region code.
 *
 * Labels are compared after normalizeCityName. "Bruxelles" gives "BXL", the
 * five Walloon provinces give "WAL", and any other non-empty label falls back
 * to "VLG".
 *
 * @param {string} province Province label.
 * @returns {"BXL"|"WAL"|"VLG"|null} Region code, or null when the label
 *   normalizes to nothing.
 */
function beRegionFromProvince(province) {
  const key = normalizeCityName(province);
  if (!key) return null;

  if (key === normalizeCityName("Bruxelles")) return "BXL";

  const walloonProvinces = ["Liege", "Namur", "Hainaut", "Luxembourg", "Brabant wallon"];
  const isWalloon = walloonProvinces.some((label) => normalizeCityName(label) === key);
  return isWalloon ? "WAL" : "VLG";
}

/**
 * Resolves a Belgian postal code to a region code by chaining
 * beProvinceFromPostalCode and beRegionFromProvince.
 *
 * @param {string|number} postal_code Postal code to classify.
 * @returns {*} The region returned by beRegionFromProvince, or null when the
 *   postal code maps to no province.
 */
function beRegionFromPostalCode(postal_code) {
  const province = beProvinceFromPostalCode(postal_code);
  if (!province) return null;
  return beRegionFromProvince(province);
}

/**
 * Resolves a city hint to a province label.
 *
 * The hint is normalized with normalizeCityName. If it already names a known
 * province/region it is mapped directly; otherwise the CITY_PROVINCES
 * configuration is searched for a city whose normalized name matches.
 *
 * @param {string} cityHint City or province name.
 * @returns {string|null} The province label, or null when nothing matches (or
 *   the configured value is blank).
 */
function beProvinceFromCityHint(cityHint) {
  const key = normalizeCityName(cityHint);
  if (!key) return null;

  // If the hint is already a province/region label, accept it.
  switch (key) {
    case "liege":
      return "Liege";
    case "namur":
      return "Namur";
    case "hainaut":
      return "Hainaut";
    case "luxembourg":
      return "Luxembourg";
    case "brabant wallon":
      return "Brabant wallon";
    case "bruxelles":
    case "brussels":
      return "Bruxelles";
    default:
      break;
  }

  // Config override mapping (recommended).
  const configured = Object.entries(CITY_PROVINCES || {}).find(([city]) => normalizeCityName(city) === key);
  if (!configured) return null;
  return normSpaces(configured[1]) || null;
}

/**
 * Cuts a short, whitespace-normalized excerpt of `text` around `index`.
 *
 * The window spans 120 characters before and 160 characters after the index
 * (clamped to the text bounds); the normalized result is capped at 180
 * characters.
 *
 * @param {*} text Source text (coerced to a string).
 * @param {number} index Anchor offset; non-finite values are treated as 0.
 * @returns {string|null} The excerpt, or null when there is no text to return.
 */
function addressSnippetAround(text, index) {
  const source = String(text || "");
  if (!source) return null;

  const anchor = Number.isFinite(index) ? index : 0;
  const from = Math.max(0, anchor - 120);
  const to = Math.min(source.length, anchor + 160);

  const excerpt = normSpaces(source.slice(from, to));
  return excerpt ? excerpt.slice(0, 180) : null;
}

/**
 * Concatenates the text of page regions that typically hold contact or
 * address details (address/footer/header elements and elements whose class or
 * id mentions contact, adresse, address or localisation).
 *
 * @param {Function} $ cheerio-style selector function for the loaded page.
 * @returns {string} Non-empty region texts joined with newlines, in selector
 *   order; an empty string when nothing matched.
 */
function extractLocationBlob($) {
  const regionSelectors = [
    "address",
    "footer",
    "header",
    '[class*="contact"]',
    '[id*="contact"]',
    '[class*="adresse"]',
    '[id*="adresse"]',
    '[class*="address"]',
    '[id*="address"]',
    '[class*="localisation"]',
    '[id*="localisation"]',
  ];

  // A selector that fails to evaluate simply contributes nothing.
  const readRegionText = (selector) => {
    try {
      return $(selector).text();
    } catch {
      return null;
    }
  };

  return regionSelectors.map(readRegionText).filter(Boolean).join("\n");
}

/**
 * Collects same-host page links from a document for further crawling.
 *
 * Each href is resolved with absolutize against `baseUrl`; links to another
 * hostname are dropped, fragments are stripped, and links whose *path* ends
 * with a binary/file extension (pdf, images, zip) are skipped. The extension
 * is checked on the path rather than the whole URL so that a query string
 * (e.g. "brochure.pdf?dl=1") cannot hide it.
 *
 * @param {Function} $ cheerio-style selector function for the loaded page.
 * @param {string} baseUrl URL of the page the links were found on.
 * @returns {string[]} Unique absolute URLs in document order; empty when
 *   `baseUrl` cannot be parsed.
 */
function extractCandidateLinks($, baseUrl) {
  // Parse the base once instead of once per link; an unparsable base yields no links.
  let baseHost;
  try {
    baseHost = new URL(baseUrl).hostname;
  } catch {
    return [];
  }

  const skippedExtensions = /\.(pdf|jpg|jpeg|png|webp|gif|zip)$/i;
  const unique = new Set();

  $("a[href]").each((_, el) => {
    const abs = absolutize($(el).attr("href"), baseUrl);
    if (!abs) return;
    try {
      const target = new URL(abs);
      if (target.hostname !== baseHost) return;
      if (skippedExtensions.test(target.pathname)) return;
      target.hash = "";
      unique.add(target.toString());
    } catch {
      // Unparsable link: ignore it.
    }
  });

  return Array.from(unique);
}

/**
 * Scores a URL for crawl priority.
 *
 * Every keyword found anywhere in the lower-cased URL adds 10 points
 * (overlapping keywords such as "legal"/"legales" each count), and shallow
 * URLs get a small bonus of up to 8 minus the number of "/"-separated parts.
 *
 * @param {string} url Absolute URL to score.
 * @returns {number} The priority score (higher = more interesting).
 */
function scoreLink(url) {
  const lowered = url.toLowerCase();

  const keywords = [
    "contact",
    "mentions",
    "legales",
    "legal",
    "cgv",
    "cgu",
    "privacy",
    "confidentialite",
    "a-propos",
    "about",
    "equipe",
    "team",
    "services",
    "prestations",
    "tarifs",
    "methode",
    "approche",
    "philosophie",
    "galerie",
    "photos",
    "impressum",
    "tva",
    "vat",
    "entreprise",
    "company",
  ];

  const keywordScore = keywords.reduce(
    (total, keyword) => (lowered.includes(keyword) ? total + 10 : total),
    0
  );
  const segmentCount = lowered.split("/").length;
  const depthBonus = Math.max(0, 8 - segmentCount);

  return keywordScore + depthBonus;
}

/**
 * Tries to read page URLs from the site's sitemap.
 *
 * Looks at /sitemap.xml then /sitemap_index.xml on the origin of `startUrl`
 * and returns the <loc> entries of the first file that yields usable URLs.
 * Entries are trimmed, unwrapped from CDATA and XML-entity-decoded (sitemap
 * URLs are entity-escaped, e.g. "&amp;"), then kept only when they belong to
 * the same domain and do not end with a file extension (pdf, images, zip).
 *
 * @param {string} startUrl Any absolute URL of the site.
 * @returns {Promise<string[]>} Up to 120 URLs; empty when no sitemap is usable.
 * @throws {TypeError} When `startUrl` is not a valid absolute URL.
 */
async function tryReadSitemap(startUrl) {
  const base = new URL(startUrl).origin;
  const candidates = [`${base}/sitemap.xml`, `${base}/sitemap_index.xml`];
  const skippedExtensions = /\.(pdf|jpg|jpeg|png|webp|gif|zip)$/i;

  // Turns the raw content of a <loc> element into a plain URL string.
  const decodeLoc = (raw) =>
    String(raw || "")
      .replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, "$1")
      .trim()
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      // Decode "&amp;" last so already-escaped sequences are not decoded twice.
      .replace(/&amp;/g, "&");

  for (const sitemapUrl of candidates) {
    try {
      const res = await fetchWithTimeout(
        sitemapUrl,
        { redirect: "follow", headers: { "User-Agent": USER_AGENT } },
        FETCH_TIMEOUT_MS
      );
      if (!res.ok) continue;

      const xml = await res.text();
      const siteDomain = domainOf(startUrl);

      // [\s\S] lets a <loc> span several lines, which pretty-printed sitemaps often do.
      const clean = Array.from(xml.matchAll(/<loc>([\s\S]*?)<\/loc>/gi), (m) => decodeLoc(m[1]))
        .map((x) => safeUrl(x))
        .filter((x) => x && domainOf(x) === siteDomain)
        .filter((x) => !skippedExtensions.test(x));

      if (clean.length) return clean.slice(0, 120);
    } catch {
      // Best-effort: a failing candidate just means we try the next one.
    }
  }
  return [];
}

/**
 * Crawls one website (same-host pages only) and aggregates the business
 * facts that can be found on it.
 *
 * Pages are taken from a priority queue seeded with the start URL, a list of
 * common paths (/contact, /mentions-legales, ...) and the sitemap, then fed by
 * the same-host links discovered on each page. At most MAX_PAGES_PER_SITE
 * pages are successfully fetched. For each page the function extracts JSON-LD
 * business data, text, location hints, social links, opening hours, founder
 * details, rating/review signals, phones, emails, BCE numbers, image
 * candidates and logo candidates.
 *
 * @param {string} startUrl Absolute URL where the crawl starts.
 * @param {object} [options]
 * @param {string} [options.cityHint] Echoed back as `ideal_zone` in the result.
 * @returns {Promise<object>} Aggregated result: `base`, `name`, `address`,
 *   `postal_code`, `city`, `public_phone`, `public_phone_source`, `emails`,
 *   `bces`, `imageCandidates`, `corpus` (page texts capped at
 *   MAX_CORPUS_CHARS), `priceRange`, `devisGratuit`, social URLs, contact and
 *   founder fields, `ideal_zone`, `languages`, `availability`,
 *   `opening_hours`, `google_rating`, `google_reviews_count`,
 *   `google_reviews` (at most 10), `pagesVisited`, `logoUrl`, `logoCandidates`.
 * @throws {TypeError} When `startUrl` is not a valid absolute URL.
 */
async function crawlSite(startUrl, { cityHint } = {}) {
  const visited = new Set();
  const pages = [];
  const texts = [];
  const emails = new Set();
  const phonesTel = new Set();
  const phonesJsonld = new Set();
  const phonesText = new Set();
  const bces = new Set();
  const imageCandidates = new Map();
  const logoCandidates = new Map();
  const priceRangeCandidates = new Set();
  const socialCandidates = {
    facebook: new Set(),
    instagram: new Set(),
    linkedin: new Set(),
    tiktok: new Set(),
  };
  const detectedLanguages = new Set();
  const googleReviewKeys = new Set();
  const googleReviews = [];
  let devisGratuitFound = false;
  let openingHoursBest = null;
  let contact_name = null;
  let founder_name = null;
  let founder_role = null;
  let founder_photo_url = null;
  let google_rating = null;
  let google_reviews_count = null;
  let availability = null;

  // Text-extracted location hints (sites often put these only in header/footer).
  let textPostal = null;
  let textCity = null;
  let textAddress = null;

  // Best JSON-LD business node seen so far, with the page it came from
  // (needed to resolve relative logo URLs).
  let bestBiz = null;
  let bestBizScore = -1;
  let bestBizBaseUrl = startUrl;

  const base = new URL(startUrl).origin;
  const commonPaths = [
    "/",
    "/contact",
    "/contact/",
    "/mentions-legales",
    "/mentions-legales/",
    "/mentions-legales.html",
    "/legal",
    "/legal-notice",
    "/privacy",
    "/a-propos",
    "/about",
    "/services",
    "/prestations",
    "/tarifs",
  ]
    .map((p) => safeUrl(base + p))
    .filter(Boolean);

  const sitemapUrls = await tryReadSitemap(startUrl);

  // Logo candidates accumulate their bonus across every place they are seen.
  const addLogoCandidate = (u, bonus = 0, refUrl = startUrl) => {
    const abs = absolutize(u, refUrl);
    if (!abs || isBadLogoUrl(abs)) return;
    const prev = logoCandidates.get(abs) || { bonus: 0 };
    logoCandidates.set(abs, { bonus: prev.bonus + Number(bonus || 0) });
  };

  // Accepts the shapes a JSON-LD logo can take: a string, an array, or an
  // object exposing url / contentUrl / image.
  const addLogoFromValue = (value, bonus = 0, refUrl = startUrl) => {
    if (!value) return;
    if (typeof value === "string") {
      addLogoCandidate(value, bonus, refUrl);
      return;
    }
    if (Array.isArray(value)) {
      for (const x of value.slice(0, 8)) {
        if (typeof x === "string") addLogoCandidate(x, bonus, refUrl);
        else if (x && typeof x === "object") addLogoFromValue(x, bonus, refUrl);
      }
      return;
    }
    if (typeof value === "object") {
      addLogoFromValue(value.url || value.contentUrl || value.image || null, bonus, refUrl);
    }
  };

  // Crawl queue. `prio` is the negated score so an ascending sort puts the
  // most interesting URL first.
  const queue = [];
  // URLs already enqueued once. Bonuses only decrease over time (20, 15, 5, 0)
  // and the sort is stable, so the first enqueue of a URL is always the one
  // that would have been dequeued first: skipping later duplicates keeps the
  // visit order unchanged while keeping the queue small.
  const queued = new Set();
  function push(u, bonus = 0) {
    if (!u) return;
    if (visited.has(u) || queued.has(u)) return;
    queued.add(u);
    queue.push({ url: u, prio: -(scoreLink(u) + bonus) });
  }

  push(startUrl, 20);
  for (const u of commonPaths) push(u, 15);
  for (const u of sitemapUrls) push(u, 5);

  while (queue.length && pages.length < MAX_PAGES_PER_SITE) {
    queue.sort((a, b) => a.prio - b.prio);
    const { url } = queue.shift();
    if (visited.has(url)) continue;
    visited.add(url);

    logDebug(`   [CrawlPage] ${pages.length + 1}/${MAX_PAGES_PER_SITE}: ${url}`);

    const html = await fetchHtml(url);
    if (!html) continue;

    const $ = cheerio.load(html);
    // JSON-LD must be read before extractText(), which removes <script> nodes.
    const jsonldsRaw = extractJsonLd($);
    const flatJsonlds = [];
    for (const j of jsonldsRaw) flatJsonlds.push(...flattenJsonLd(j));

    const biz = pickBestStructuredBusiness(jsonldsRaw);
    if (biz) {
      // Completeness score of this page's business node; the most complete
      // node across all pages is kept as bestBiz.
      const scoreBiz = (() => {
        let s = 0;
        if (biz?.name) s += 2;
        if (biz?.email) s += 2;
        if (biz?.logo) s += 1;
        if (biz?.image) s += 1;
        const addr = biz?.address;
        if (addr) s += 1;
        if (addr?.streetAddress) s += 2;
        if (addr?.postalCode) s += 1;
        if (addr?.addressLocality) s += 1;
        return s;
      })();
      if (scoreBiz > bestBizScore) {
        bestBiz = biz;
        bestBizScore = scoreBiz;
        bestBizBaseUrl = url;
      }

      addLogoFromValue(biz?.logo, 18, url);
    }

    const pageText = extractText($);
    texts.push(`URL: ${url}\n${pageText}\n`);

    // Text of contact/address-like regions; falls back to the whole page text.
    const locBlob = (() => {
      try {
        return extractLocationBlob($) || pageText;
      } catch {
        return pageText;
      }
    })();

    detectLanguagesFromText(`${pageText}\n${locBlob}`).forEach((lang) => detectedLanguages.add(lang));
    if (!availability) availability = detectAvailabilityFromText(`${pageText}\n${locBlob}`);

    if (!contact_name) {
      const fromJsonLd = extractContactNameFromJsonLd(flatJsonlds, {
        companyNameHint: biz?.name || bestBiz?.name || null,
      });
      if (fromJsonLd) contact_name = fromJsonLd;
    }

    // Location hints from header/footer/contact blocks (not only JSON-LD).
    try {
      const addrTagRaw = $("address").first().text();
      const addrTag = normSpaces(addrTagRaw);
      const addrPc = addrTag ? extractBePostalCityFromText(addrTag, null) : null;

      if (!contact_name && addrTagRaw) {
        const fromAddress = extractContactNameFromAddressText(addrTagRaw, {
          companyNameHint: biz?.name || bestBiz?.name || null,
        });
        if (fromAddress) contact_name = fromAddress;
      }

      // Prefer extracting CP/ville from the explicit <address> block when present
      // (it is typically the business physical location, vs. marketing pages that mention big cities).
      if (addrPc && isLikelyBePostalCodeForMarket(addrPc.postal_code)) {
        if (!textPostal) textPostal = addrPc.postal_code || null;
        if (!textCity) textCity = sanitizeCityValue(addrPc.city) || null;
      }

      if (!textAddress && addrTag && (looksLikeStreetAddress(addrTag) || addrPc)) {
        textAddress = addrTag;
      }

      const hit = extractBePostalCityFromText(locBlob, null);

      // If we have a "CP Ville" candidate, use it as city/postal source.
      // This is important for sites that target a big city in SEO, but have a physical address elsewhere.
      if (hit && isLikelyBePostalCodeForMarket(hit.postal_code)) {
        if (!textPostal) textPostal = hit.postal_code || null;
        if (!textCity) textCity = sanitizeCityValue(hit.city) || null;
      }

      if (!textAddress) {
        if (hit) {
          const snip = addressSnippetAround(locBlob, hit.index);
          if (looksLikeStreetAddress(snip)) textAddress = snip;

          const pc = snip ? extractBePostalCityFromText(snip, null) : null;
          if (pc && isLikelyBePostalCodeForMarket(pc.postal_code)) {
            if (!textPostal) textPostal = pc.postal_code || null;
            if (!textCity) textCity = sanitizeCityValue(pc.city) || null;
          }
        }
      }
    } catch {
      // ignore
    }

    // pricing hints + structured extras (optional)
    for (const o of flatJsonlds) {
      const pr = o?.priceRange;
      if (typeof pr === "string") priceRangeCandidates.add(normSpaces(pr));

      const sameAs = o?.sameAs;
      if (typeof sameAs === "string") {
        const social = classifySocialUrl(sameAs, url);
        if (social) socialCandidates[social.platform].add(social.url);
      } else if (Array.isArray(sameAs)) {
        for (const x of sameAs) {
          const social = classifySocialUrl(x, url);
          if (social) socialCandidates[social.platform].add(social.url);
        }
      }

      if (typeof o?.url === "string") {
        const social = classifySocialUrl(o.url, url);
        if (social) socialCandidates[social.platform].add(social.url);
      }
    }
    if (typeof biz?.priceRange === "string") priceRangeCandidates.add(normSpaces(biz.priceRange));
    if (!devisGratuitFound && /\bdevis\s+(gratuit|offert)\b/i.test(pageText)) devisGratuitFound = true;

    const pageOpeningHoursJsonLd = extractOpeningHoursFromJsonLd(flatJsonlds);
    const pageOpeningHoursText = RECOVER_OPENING_HOURS ? extractOpeningHoursFromText(`${locBlob}\n${pageText}`) : null;
    const pageOpeningHours = pickBestOpeningHours(pageOpeningHoursJsonLd, pageOpeningHoursText);
    openingHoursBest = pickBestOpeningHours(openingHoursBest, pageOpeningHours);

    // Founder: structured data first; the first name found is kept.
    const founderFromJsonLd = extractFounderFromJsonLd(flatJsonlds, url);
    if (founderFromJsonLd?.founder_name && !founder_name) {
      founder_name = founderFromJsonLd.founder_name;
      founder_role = founderFromJsonLd.founder_role;
    }
    if (founderFromJsonLd?.founder_photo_url && !founder_photo_url) {
      founder_photo_url = founderFromJsonLd.founder_photo_url;
    }

    if (RECOVER_FOUNDER) {
      const founderFromText = extractFounderFromText(`${$("title").text()} ${pageText}`, {
        companyNameHint: biz?.name || null,
        urlHint: url,
      });
      if (founderFromText?.founder_name && !founder_name) {
        founder_name = founderFromText.founder_name;
        founder_role = founderFromText.founder_role || founder_role || "Fondateur";
      }
      if (!founder_photo_url) {
        const founderNameForPhoto = founder_name || founderFromText?.founder_name || null;
        const fromPage = extractFounderPhotoFromPage($, url, founderNameForPhoto);
        if (fromPage) founder_photo_url = fromPage;
      }
    }

    // Rating: keep the page reporting the largest review count.
    const googleSignals = extractGoogleSignalsFromJsonLd(flatJsonlds);
    if (googleSignals.google_rating !== null) {
      const pageCount = googleSignals.google_reviews_count ?? -1;
      const bestCount = google_reviews_count ?? -1;
      if (google_rating === null || pageCount > bestCount) {
        google_rating = googleSignals.google_rating;
        google_reviews_count = googleSignals.google_reviews_count;
      }
    }
    for (const r of googleSignals.google_reviews || []) {
      // Enforce the cap before appending: checking only after the push let
      // every later page add one more review once 10 had been collected.
      if (googleReviews.length >= 10) break;
      const key = `${r.author || ""}|${r.publishedAt || ""}|${String(r.text || "").slice(0, 120)}`;
      if (googleReviewKeys.has(key)) continue;
      googleReviewKeys.add(key);
      googleReviews.push(r);
    }

    // phones (optional)
    extractPhonesFromTelLinks($).forEach((p) => phonesTel.add(p));
    extractPhonesFromJsonLd(flatJsonlds).forEach((p) => phonesJsonld.add(p));
    extractPhonesFromText(locBlob).forEach((p) => phonesText.add(p));
    extractPhonesFromText(pageText).forEach((p) => phonesText.add(p));

    extractEmailsFromText(pageText).forEach((e) => {
      const n = normalizeEmail(e);
      if (n) emails.add(n);
    });
    extractEmailsFromMailto($).forEach((e) => {
      const n = normalizeEmail(e);
      if (n) emails.add(n);
    });
    extractEmailsFromJsonLd(flatJsonlds).forEach((e) => {
      const n = normalizeEmail(e);
      if (n) emails.add(n);
    });

    extractBceCandidates(pageText).forEach((b) => bces.add(b));

    const og = $('meta[property="og:image"]').attr("content");
    const tw = $('meta[name="twitter:image"]').attr("content");
    const imgFromBiz = biz?.image || biz?.logo;

    // Image candidates accumulate bonus and semantic score across occurrences.
    const addImg = (u, bonus = 0, semantic = 0) => {
      const abs = absolutize(u, url);
      if (!abs || isBadImageUrl(abs)) return;
      const prev = imageCandidates.get(abs) || { bonus: 0, semantic: 0 };
      imageCandidates.set(abs, { bonus: prev.bonus + bonus, semantic: prev.semantic + semantic });
    };

    if (og) addImg(og, 30, 4);
    if (tw) addImg(tw, 10, 2);
    if (typeof imgFromBiz === "string") addImg(imgFromBiz, 20, 2);
    if (Array.isArray(imgFromBiz)) imgFromBiz.slice(0, 10).forEach((x) => typeof x === "string" && addImg(x, 10, 1));

    for (const logoCand of extractLogoCandidatesFromPage($, url)) {
      addLogoCandidate(logoCand.url, logoCand.bonus, url);
    }

    $("img").each((_, el) => {
      const src =
        $(el).attr("src") ||
        $(el).attr("data-src") ||
        $(el).attr("data-lazy-src") ||
        pickFromSrcset($(el).attr("srcset"));
      if (!src) return;

      const alt = String($(el).attr("alt") || "");
      const title = String($(el).attr("title") || "");
      const cls = String($(el).attr("class") || "");
      const id = String($(el).attr("id") || "");
      const contextHint = [
        $(el).closest("section,article,main,header,footer,nav").attr("class") || "",
        $(el).closest("section,article,main,header,footer,nav").attr("id") || "",
        $(el).parent().attr("class") || "",
      ].join(" ");
      const semantic = imageSemanticScore({ src, alt, title, cls, id, pageUrl: url, contextHint });

      // Positional bonus: content/hero images are favoured, chrome (header/nav/footer) is penalized.
      let bonus = 1;
      if ($(el).closest("main,article,.content,.entry,.portfolio,.projects,.gallery").length) bonus += 4;
      if ($(el).closest(".hero,.banner,[class*='hero'],[id*='hero']").length) bonus += 5;
      if ($(el).closest("header,nav,footer,.site-header,.navbar,.menu").length) bonus -= 8;

      addImg(src, bonus, semantic);
    });
    $("source").each((_, el) => {
      const src = pickFromSrcset($(el).attr("srcset") || $(el).attr("data-srcset"));
      if (!src) return;
      addImg(src, 2, 1);
    });
    // Inline-style background images.
    $("[style]").each((_, el) => {
      const style = String($(el).attr("style") || "");
      if (!style) return;
      const re = /url\(\s*['"]?([^'")<>]+\.(?:avif|webp|png|jpe?g|gif))(?:\?[^'")<>]*)?['"]?\s*\)/gi;
      let m;
      while ((m = re.exec(style)) !== null) {
        addImg(m[1], 2, 1);
      }
    });
    // Last resort when few images were found so far: scan the raw HTML.
    if (imageCandidates.size < 4) {
      for (const rawUrl of extractImageCandidateUrlsFromHtmlRaw(html, url)) {
        addImg(rawUrl, 1, 0);
      }
    }
    $("a[href]").each((_, el) => {
      const social = classifySocialUrl($(el).attr("href"), url);
      if (social) socialCandidates[social.platform].add(social.url);
    });

    const links = extractCandidateLinks($, url).slice(0, 120);
    for (const l of links) push(l, 0);

    pages.push({ url, html });
  }

  let name = null;
  let address = null;
  let postal = null;
  let city = null;
  let logo = null;

  try {
    if (bestBiz) {
      name = bestBiz.name ? normSpaces(bestBiz.name) : null;
      address = extractAddressFromJsonLd(bestBiz);
      postal = extractPostalFromJsonLd(bestBiz);
      city = extractCityFromJsonLd(bestBiz);
      if (bestBiz.logo && typeof bestBiz.logo === "string") logo = absolutize(bestBiz.logo, bestBizBaseUrl);
    }

    // No structured name: fall back to the markup of the first fetched page.
    if (!name) {
      const html0 = pages.length ? pages[0].html : null;
      if (html0) {
        const $0 = cheerio.load(html0);
        name = extractNameFromHtml($0) || null;
      }
    }
  } catch {
    // ignore
  }

  const corpus = texts.join("\n").slice(0, MAX_CORPUS_CHARS);
  // Phone source preference: tel: links, then JSON-LD, then free text.
  const phonePick = (() => {
    for (const p of phonesTel) return { phone: p, source: "tel_link" };
    for (const p of phonesJsonld) return { phone: p, source: "jsonld" };
    for (const p of phonesText) return { phone: p, source: "text" };
    return { phone: null, source: null };
  })();
  const public_phone = phonePick.phone || null;
  const public_phone_source = phonePick.source || null;
  const priceRange = (() => {
    if (bestBiz && typeof bestBiz.priceRange === "string") return normSpaces(bestBiz.priceRange);
    for (const pr of priceRangeCandidates) return pr;
    return null;
  })();
  const pickFirst = (set) => {
    for (const v of set || []) {
      if (v) return v;
    }
    return null;
  };
  // "français" is always listed first; detected languages are appended.
  const languages = (() => {
    const list = new Set(["français"]);
    for (const l of detectedLanguages) list.add(l);
    return Array.from(list);
  })();
  const pickedLogo = await pickBestLogoUrl(
    Array.from(logoCandidates.entries()).map(([url, meta]) => ({ url, bonus: meta.bonus })),
    logo ? safeUrl(logo) : null
  );
  const logoCandidatesRanked = Array.from(logoCandidates.entries())
    .sort((a, b) => Number(b?.[1]?.bonus || 0) - Number(a?.[1]?.bonus || 0))
    .map(([url]) => safeUrl(url))
    .filter(Boolean);

  return {
    base,
    name: name || null,
    address: sanitizeAddressText(address || textAddress || null),
    // Text-derived postal code / city take precedence over the JSON-LD values.
    postal_code: textPostal || postal || null,
    city: sanitizeCityValue(textCity || city || null),
    public_phone,
    public_phone_source,
    emails: Array.from(emails),
    bces: Array.from(bces),
    imageCandidates: Array.from(imageCandidates.entries()).map(([url, meta]) => ({
      url,
      bonus: meta.bonus,
      semantic: meta.semantic || 0,
    })),
    corpus,
    priceRange,
    devisGratuit: devisGratuitFound ? true : null,
    facebook: pickFirst(socialCandidates.facebook),
    instagram: pickFirst(socialCandidates.instagram),
    linkedin: pickFirst(socialCandidates.linkedin),
    tiktok: pickFirst(socialCandidates.tiktok),
    contact_name,
    founder_name,
    founder_role,
    founder_photo_url,
    ideal_zone: cityHint || null,
    languages,
    availability: availability || null,
    opening_hours: sanitizeOpeningHours(openingHoursBest),
    google_rating,
    google_reviews_count,
    google_reviews: googleReviews,
    pagesVisited: pages.length,
    logoUrl: pickedLogo ? safeUrl(pickedLogo) : null,
    logoCandidates: logoCandidatesRanked.slice(0, 20),
  };
}

// -------------------- LLM (OLLAMA) --------------------
/**
 * Sends a single user message to the Ollama `/api/chat` endpoint and returns
 * the reply's `message.content` parsed as JSON.
 *
 * Timeout/abort errors and HTTP 408/429/5xx answers are retried up to two times
 * with a linear back-off (1500 ms, then 3000 ms). Any other transport error,
 * a non-retryable status, an unreadable response body, or a missing/non-JSON
 * `message.content` resolves to `null`.
 *
 * @param {object} schema - Value sent as the request's `format` field.
 * @param {string} prompt - Content of the single user message.
 * @param {{ temperature?: number|string|null }} [options] - `temperature`
 *   overrides OLLAMA_TEMPERATURE only when it is a finite number or a
 *   non-blank numeric string.
 * @returns {Promise<any|null>} Parsed JSON content, or `null` on failure.
 */
async function ollamaJson(schema, prompt, { temperature = null } = {}) {
  // Number(null) and Number("") are both 0 (finite), so "not provided" has to be
  // ruled out before the finite check; otherwise the configured default
  // temperature would silently be replaced by 0.
  const isNumericInput =
    typeof temperature === "number" || (typeof temperature === "string" && temperature.trim() !== "");
  const resolvedTemperature =
    isNumericInput && Number.isFinite(Number(temperature)) ? Number(temperature) : OLLAMA_TEMPERATURE;
  const body = {
    model: OLLAMA_MODEL,
    messages: [{ role: "user", content: prompt }],
    format: schema,
    options: { temperature: resolvedTemperature, num_ctx: OLLAMA_NUM_CTX },
    stream: false,
  };

  const isRetryableStatus = (status) => status === 408 || status === 429 || status >= 500;
  const isAbortLikeError = (err) => {
    const name = String(err?.name || "");
    const msg = String(err?.message || "").toLowerCase();
    return name === "AbortError" || msg.includes("aborted") || msg.includes("timeout");
  };

  const maxRetries = 2;
  // Linear back-off: 1500 ms after the first failed attempt, 3000 ms after the second.
  const backoffMs = (attempt) => 1500 * (attempt + 1);

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    let res;
    try {
      res = await fetchWithTimeout(
        `${OLLAMA_URL}/api/chat`,
        {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify(body),
        },
        OLLAMA_TIMEOUT_MS
      );
    } catch (err) {
      if (attempt < maxRetries && isAbortLikeError(err)) {
        const wait = backoffMs(attempt);
        console.warn(`   Ollama timeout/abort; retry in ${wait}ms...`);
        await sleep(wait);
        continue;
      }
      // Best-effort enrichment: non-retryable transport errors yield "no result".
      return null;
    }

    if (!res.ok) {
      if (attempt < maxRetries && isRetryableStatus(res.status)) {
        const wait = backoffMs(attempt);
        console.warn(`   Ollama HTTP ${res.status}; retry in ${wait}ms...`);
        await sleep(wait);
        continue;
      }
      return null;
    }

    let json = null;
    try {
      json = await res.json();
    } catch {
      return null;
    }

    // The structured answer arrives as a JSON string inside message.content.
    const content = json?.message?.content;
    if (!content) return null;
    try {
      return JSON.parse(content);
    } catch {
      return null;
    }
  }

  return null;
}

/**
 * Builds the public description from the LLM one-liner alone.
 *
 * Only `oneLiner` is read; the remaining parameters are accepted but unused
 * (services, zone, etc. live in their own DB columns and are displayed
 * separately, so no labelled template text is appended here).
 *
 * @param {string|null} oneLiner - Candidate description text.
 * @returns {string} Sanitized one-liner, or a fixed placeholder when it is empty.
 */
function buildDescription(oneLiner, services, zone, firstContact, piliers) {
  const placeholder = "Presentation a completer lors de la revendication.";
  const publicText = sanitizePublicFreeText(oneLiner);
  return normSpaces(publicText ? publicText : placeholder);
}

/**
 * Replaces every e-mail-looking substring with a single space.
 *
 * @param {*} s - Text to scrub; coerced with `String()` when not nullish.
 * @returns {string|null} Scrubbed text, or `null` for `null`/`undefined` input.
 */
function stripEmailsFromFreeText(s) {
  if (s == null) return null;
  const emailPattern = /[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}/gi;
  const text = String(s);
  return text.replace(emailPattern, " ");
}

/**
 * Blanks out phone-number-looking substrings.
 *
 * A candidate (prefix `+32`, `0032` or `0`, followed by 8 to 15 digits with
 * short separators) is replaced by a space only when `normalizeBePhone`
 * returns a truthy value for it; other digit runs such as postal codes are
 * left untouched.
 *
 * @param {*} s - Text to scrub; coerced with `String()` when not nullish.
 * @returns {string|null} Scrubbed text, or `null` for `null`/`undefined` input.
 */
function stripPhonesFromFreeText(s) {
  if (s == null) return null;
  const candidatePattern = /(?:\+32|0032|0)\s*(?:\(0\)\s*)?(?:\d[\s.\-()/]{0,3}){7,14}\d/g;
  const blankIfPhone = (candidate) => (normalizeBePhone(candidate) ? " " : candidate);
  return String(s).replace(candidatePattern, blankIfPhone);
}

/**
 * Replaces explicit money amounts ("50 €", "1.250,00 euros", "EUR 20") with a
 * space, to avoid turning bios/metas into a price "comparator".
 *
 * @param {*} s - Text to scrub; coerced with `String()` when not nullish.
 * @returns {string|null} Scrubbed text, or `null` for `null`/`undefined` input.
 */
function stripPriceLikeFromFreeText(s) {
  if (s === null || s === undefined) return null;
  let v = String(s);
  // "€" is not a word character, so a `\b` next to it never matches in running
  // text; word boundaries are applied to the textual units only.
  // Amount followed by its currency: "50 €", "1.250,00 euros", "20 eur".
  v = v.replace(/\b\d[\d\s.,]{0,10}\s*(?:€|euros?\b|eur\b)/gi, " ");
  // Currency followed by its amount: "€ 75", "EUR 20".
  v = v.replace(/(?:€|\b(?:euros?|eur))\s*\d[\d\s.,]{0,10}\b/gi, " ");
  return v;
}

/**
 * Cleans free text before publication: e-mails, phone numbers and price
 * amounts are stripped (in that order), the word "revelys" is blanked, and
 * the result is passed through `normSpaces`.
 *
 * @param {*} s - Raw text; coerced with `String()` when not nullish.
 * @returns {string|null} Cleaned text, or `null` when the input is nullish or
 *   nothing is left after cleaning.
 */
function sanitizePublicFreeText(s) {
  if (s == null) return null;
  const scrubbers = [stripEmailsFromFreeText, stripPhonesFromFreeText, stripPriceLikeFromFreeText];
  const scrubbed = scrubbers.reduce((text, scrub) => scrub(text), String(s));
  const cleaned = normSpaces(scrubbed.replace(/\brevelys\b/gi, " "));
  return cleaned ? cleaned : null;
}

/**
 * Joins labels into a French enumeration: "a", "a et b", "a, b et c".
 *
 * Each item goes through `normSpaces`; items that come back empty are dropped.
 *
 * @param {Array<*>} items - Labels to join; a non-array is treated as empty.
 * @returns {string} The enumeration, or "" when nothing usable remains.
 */
function formatListFr(items) {
  const source = Array.isArray(items) ? items : [];
  const labels = source.reduce((kept, item) => {
    const label = normSpaces(item);
    if (label) kept.push(label);
    return kept;
  }, []);

  switch (labels.length) {
    case 0:
      return "";
    case 1:
      return labels[0];
    case 2:
      return `${labels[0]} et ${labels[1]}`;
    default: {
      const leading = labels.slice(0, -1).join(", ");
      const last = labels[labels.length - 1];
      return `${leading} et ${last}`;
    }
  }
}

// Lowercase, accent-free head nouns that isLikelyFeminineHead() accepts as
// feminine without falling back to suffix heuristics.
const SERVICE_FEMININE_HEADS = new Set([
  // generic operations
  "reparation", "installation", "maintenance", "inspection", "mise", "pose",
  "renovation", "intervention", "verification", "assistance", "analyse", "etude",
  "evaluation", "expertise", "consultation", "coordination", "preparation", "selection",
  "planification", "fabrication", "decoration", "restauration", "livraison", "location",
  "formation", "gestion", "surveillance", "desinfection", "sterilisation", "securisation",
  "detection", "evacuation", "isolation", "ventilation", "climatisation", "plomberie",
  "toiture", "vitrerie", "serrurerie", "menuiserie", "maconnerie", "peinture",
  "electricite", "humidite", "depollution", "sanitisation", "signalisation", "mediation",
  // care / wellness / health
  "coiffure", "esthetique", "manucure", "pedicure", "epilation", "massotherapie",
  "kinesitherapie", "orthopedie", "nutrition", "psychotherapie", "therapie",
  // admin / business
  "comptabilite", "fiscalite", "traduction", "communication",
  "conception", "integration", "migration", "automatisation",
]);

// Lowercase, accent-free head nouns that isLikelyFeminineHead() rejects as
// masculine before trying the feminine suffix heuristics.
const SERVICE_MASCULINE_HEADS = new Set([
  "debouchage", "depannage", "diagnostic", "entretien", "controle", "traitement",
  "nettoyage", "curage", "ramonage", "elagage", "poncage", "relevage",
  "chauffage", "forage", "carottage", "terrassement", "amenagement", "demenagement",
  "gardiennage", "stockage", "transport", "montage", "demontage", "calibrage",
  "reglage", "parametrage", "developpement", "hebergement", "referencement", "accompagnement",
  "conseil", "suivi", "coaching", "devis",
]);

// Lowercase, accent-free head nouns that isLikelyPluralHead() treats as plural
// regardless of their length or ending.
const SERVICE_PLURAL_HEADS = new Set([
  "travaux", "prestations", "services", "interventions",
  "diagnostics", "reparations", "installations", "maintenances",
  "depannages", "amenagements", "finitions", "conseils",
]);

// Word endings that isLikelyFeminineHead() takes as a hint of a feminine noun
// when the word is in neither explicit head list. Each entry is passed through
// normalizeCityName() by that function before being compared.
const SERVICE_FEMININE_SUFFIXES = [
  "tion",
  "sion",
  "aison",
  "ison",
  "ance",
  "ence",
  "erie",
  "euse",
  "eure",
  "ette",
  "esse",
  "ude",
  "ure",
  "ie",
  "ite",
  "té",
];

// Singular nouns ending in "s"/"x" that isLikelyPluralHead() must not treat as plural.
const SERVICE_SINGULAR_S_EXCEPTIONS = new Set([
  "devis",
  "avis",
  "prix",
  "surplus",
]);

/**
 * Tells whether the normalized form of `raw` begins with a, e, i, o, u, y or h.
 * Every leading "h" counts; no distinction is made between mute and aspirated h.
 *
 * @param {*} raw - Word to test; normalized with `normalizeCityName`.
 * @returns {boolean} `false` when normalization yields nothing.
 */
function startsWithVowelOrMuteH(raw) {
  const elidingInitials = new Set(["a", "e", "i", "o", "u", "y", "h"]);
  const normalized = normalizeCityName(raw);
  if (!normalized) return false;
  return elidingInitials.has(String(normalized).charAt(0));
}

/**
 * Guesses whether a normalized head word is plural.
 *
 * Known plural heads win, then known singular exceptions; otherwise a word of
 * at least five characters ending in "s" or "x" is considered plural.
 *
 * @param {string} firstNorm - Normalized first word of a service label.
 * @returns {boolean}
 */
function isLikelyPluralHead(firstNorm) {
  if (!firstNorm) return false;
  if (SERVICE_PLURAL_HEADS.has(firstNorm)) return true;
  return (
    !SERVICE_SINGULAR_S_EXCEPTIONS.has(firstNorm) &&
    firstNorm.length >= 5 &&
    (firstNorm.endsWith("s") || firstNorm.endsWith("x"))
  );
}

/**
 * Guesses whether a normalized head word is feminine.
 *
 * The explicit feminine list wins, then the explicit masculine list; any
 * other word is feminine when it ends with one of the feminine suffixes
 * (each suffix is normalized with `normalizeCityName` before comparison).
 *
 * @param {string} firstNorm - Normalized first word of a service label.
 * @returns {boolean}
 */
function isLikelyFeminineHead(firstNorm) {
  if (!firstNorm) return false;
  if (SERVICE_FEMININE_HEADS.has(firstNorm)) return true;
  if (SERVICE_MASCULINE_HEADS.has(firstNorm)) return false;
  for (const suffix of SERVICE_FEMININE_SUFFIXES) {
    if (firstNorm.endsWith(normalizeCityName(suffix))) return true;
  }
  return false;
}

/**
 * Lower-cases a service label and prefixes it with a French definite article.
 *
 * Labels that already start with an article/partitive are returned as-is.
 * Otherwise the first word (split on whitespace or hyphen) decides, in this
 * order: plural -> "les ", vowel/h initial -> "l'", feminine -> "la ", else "le ".
 *
 * @param {*} raw - Service label.
 * @returns {string|null} Label with article, or `null` for an empty label.
 */
function serviceWithDefiniteArticleFr(raw) {
  const label = normSpaces(String(raw || "")).toLowerCase();
  if (!label) return null;

  const alreadyDetermined = /^(le|la|les|l'|du|de la|de l'|des)\b/.test(label);
  if (alreadyDetermined) return label;

  const headWord = label.split(/[\s-]+/)[0] || "";
  const headNorm = normalizeCityName(headWord);

  let article;
  if (isLikelyPluralHead(headNorm)) {
    article = "les ";
  } else if (startsWithVowelOrMuteH(headWord)) {
    article = "l'";
  } else if (isLikelyFeminineHead(headNorm)) {
    article = "la ";
  } else {
    article = "le ";
  }
  return `${article}${label}`;
}

/**
 * Derives a short French activity label.
 *
 * The normalized industry is matched against keyword fragments (first match
 * wins). Without a match, the first service (lower-cased) is used, then the
 * cleaned industry text with "/" turned into " et ", then "services".
 *
 * @param {string} industry - Industry label.
 * @param {Array<*>} [services] - Service labels; only the first one is read.
 * @returns {string}
 */
function shortActivityLabel(industry, services) {
  const industryKey = normalizeCityName(industry);

  // Ordered [fragments, label] rules; evaluation order matters.
  const rules = [
    [["plomb"], "plomberie"],
    [["electri"], "électricité"],
    [["chauff"], "chauffage"],
    [["serrur"], "serrurerie"],
    [["toit", "couvr"], "toiture"],
    [["nettoy"], "nettoyage"],
    [["jardin"], "jardinage"],
    [["menuis"], "menuiserie"],
    [["vitrier"], "vitrerie"],
    [["facad", "crepi"], "façade"],
    [["isolation", "humid"], "isolation et traitement de l'humidité"],
    [["renov", "entrepreneur"], "rénovation"],
    [["demenag", "garde"], "déménagement"],
    [["alarme", "camera", "securit"], "sécurité"],
  ];
  for (const [fragments, label] of rules) {
    if (fragments.some((fragment) => industryKey.includes(fragment))) return label;
  }

  const firstService = (Array.isArray(services) ? services[0] : null) || null;
  if (firstService) return normSpaces(String(firstService)).toLowerCase();

  // Fallback to a cleaned industry label.
  const cleanedIndustry = normSpaces(String(industry || "")).replace(/\s*\/\s*/g, " et ").toLowerCase();
  return cleanedIndustry || "services";
}

// Matches a dangling French connector ("et", "de", "à", "pour", ...) at the very
// end of a string, plus any trailing whitespace.
// The leading guard is a Unicode-aware lookbehind rather than `\b`: `\b` only
// knows ASCII word characters, so it could never match a standalone "à" and it
// wrongly saw a boundary inside accented words ("idéaux" -> "aux", "préau" -> "au").
const TRAILING_CONNECTOR_RE = /(?<![\p{L}\p{N}_])(?:et|ou|avec|de|du|des|a|à|au|aux|pour|sur|dans|en|via)\s*$/iu;

/**
 * Turns sanitized free text into a block that ends on a clean sentence.
 *
 * Steps: sanitize + clamp to `maxLen`; if the text does not end with ., ! or ?
 * and the last sentence stop sits at or beyond 55% of the text, cut back to
 * that stop; strip dangling punctuation (, : ;) and trailing connectors until
 * nothing more can be removed; finally append a period when needed.
 *
 * @param {*} raw - Raw text.
 * @param {{ maxLen?: number }} [options] - Maximum length passed to `clampLine`.
 * @returns {string|null} Finalized text, or `null` when nothing meaningful is left.
 */
function finalizeNarrativeSentenceBlock(raw, { maxLen = 420 } = {}) {
  let s = clampLine(sanitizePublicFreeText(raw), maxLen);
  if (!s) return null;

  if (!/[.!?]$/.test(s)) {
    const lastStop = Math.max(s.lastIndexOf(". "), s.lastIndexOf("! "), s.lastIndexOf("? "));
    // Only drop the unfinished tail when most of the text is kept.
    if (lastStop >= Math.floor(s.length * 0.55)) {
      s = s.slice(0, lastStop + 1).trim();
    }
  }

  // Removing a connector can expose a dangling comma (and vice versa), e.g.
  // "A, B, et" -> "A, B," -> "A, B"; repeat both passes until the text is stable.
  let previous;
  do {
    previous = s;
    s = s.replace(/[,:;]\s*$/g, "").trim();
    s = s.replace(TRAILING_CONNECTOR_RE, "").trim();
  } while (s !== previous);

  // Text made only of connectors/punctuation must not become a lone ".".
  if (!s) return null;
  if (!/[.!?]$/.test(s)) s = `${s}.`;
  return s;
}

/**
 * Measures how repetitive a text is: the share of word n-grams that are
 * repeats of an n-gram already seen earlier in the text.
 *
 * @param {*} text - Text to analyse; normalized with `normalizeBceForTokenMatch`.
 * @param {{ n?: number }} [options] - n-gram size (default 3).
 * @returns {number} 0 when the text is empty or has fewer than `2 * n` words,
 *   otherwise repeats / total n-grams.
 */
function repeatedNgramsRatio(text, { n = 3 } = {}) {
  const normalized = normalizeBceForTokenMatch(text);
  if (!normalized) return 0;

  const tokens = normalized.split(/\s+/).filter(Boolean);
  if (tokens.length < n * 2) return 0;

  const gramCount = tokens.length - n + 1;
  const occurrences = new Map();
  let repeats = 0;
  for (let start = 0; start < gramCount; start += 1) {
    const gram = tokens.slice(start, start + n).join(" ");
    const seenBefore = occurrences.get(gram) || 0;
    // Every occurrence after the first one counts as one repeat.
    if (seenBefore > 0) repeats += 1;
    occurrences.set(gram, seenBefore + 1);
  }
  return repeats / Math.max(1, gramCount);
}

/**
 * Flags a one-liner that should not be published as-is: empty after
 * normalization, too repetitive (3-gram repeat ratio above 0.2), following the
 * "est une entreprise de ..." / "est une entreprise ... active a ..." template,
 * claiming "numero 1", or mentioning "revelys".
 *
 * @param {*} raw - Candidate one-liner.
 * @returns {boolean} `true` when the text looks weak.
 */
function oneLinerLooksWeak(raw) {
  const text = normalizeBceForTokenMatch(raw);
  if (!text) return true;
  if (repeatedNgramsRatio(text, { n: 3 }) > 0.2) return true;

  const isTemplated =
    /\best\s+une\s+entreprise\s+de\b/.test(text) ||
    (/\best\s+une\s+entreprise\b/.test(text) && /\bactive\s+a\b/.test(text));
  const hasSuperlativeClaim = /\bnumero\s*1\b/.test(text);
  const mentionsPlatform = /revelys/.test(text);
  return isTemplated || hasSuperlativeClaim || mentionsPlatform;
}

// Tokens that significantCompanyNameTokens() ignores when extracting the
// distinctive words of a company name.
const COMPANY_NAME_STOP_TOKENS = new Set([
  // legal forms
  "srl", "sprl", "sa", "asbl", "sc", "scrl", "snc", "sr", "srlu", "bv", "bvba", "ltd", "inc",
  // generic business words
  "groupe", "group", "holding", "company", "entreprise", "services", "service",
  // connectors and articles
  "et", "de", "du", "des", "la", "le", "les",
]);

/**
 * Extracts the distinctive tokens of a company name, in order of appearance.
 *
 * Skipped: tokens shorter than 2 characters (2-char abbreviations such as
 * "DM" are kept), purely numeric tokens, stop tokens and duplicates.
 * Collection stops as soon as `maxTokens` tokens have been gathered.
 *
 * @param {*} name - Company name; normalized with `normalizeBceForTokenMatch`.
 * @param {number} [maxTokens=4] - Upper bound on returned tokens.
 * @returns {string[]}
 */
function significantCompanyNameTokens(name, maxTokens = 4) {
  const normalized = normalizeBceForTokenMatch(name);
  if (!normalized) return [];

  const picked = [];
  const alreadyPicked = new Set();
  const words = normalized.split(" ").filter(Boolean);
  for (let i = 0; i < words.length; i += 1) {
    const word = words[i];
    const isUsable =
      word.length >= 2 &&
      !/^\d+$/.test(word) &&
      !COMPANY_NAME_STOP_TOKENS.has(word) &&
      !alreadyPicked.has(word);
    if (!isUsable) continue;
    alreadyPicked.add(word);
    picked.push(word);
    if (picked.length >= maxTokens) break;
  }
  return picked;
}

/**
 * Checks that a description mentions at least one distinctive token of the
 * company name as a whole word.
 *
 * @param {*} rawDescription - Description text.
 * @param {*} companyName - Company name.
 * @returns {boolean} `false` for an empty description; `true` when the name
 *   has no distinctive token at all (nothing to look for).
 */
function descriptionContainsCompanyName(rawDescription, companyName) {
  const haystack = normalizeBceForTokenMatch(rawDescription);
  if (!haystack) return false;

  const nameTokens = significantCompanyNameTokens(companyName, 4);
  if (nameTokens.length === 0) return true;

  for (const nameToken of nameTokens) {
    const wholeWord = new RegExp(`\\b${escapeRegex(nameToken)}\\b`);
    if (wholeWord.test(haystack)) return true;
  }
  return false;
}

/**
 * Decides whether a street value is worth publishing: it must be non-empty,
 * not just a country name, contain at least two tokens and at least one letter.
 *
 * @param {*} addressStreet - Street line.
 * @returns {boolean}
 */
function isPublishableAddressStreet(addressStreet) {
  const street = normSpaces(String(addressStreet || ""));
  if (!street) return false;

  const key = normalizeBceForTokenMatch(street);
  if (!key) return false;

  const isCountryOnly = /^(be|belgique|belgium)$/.test(key);
  const tokenCount = key.split(" ").filter(Boolean).length;
  const hasLetter = /[a-z]/.test(key);
  return !isCountryOnly && tokenCount >= 2 && hasLetter;
}

/**
 * Builds a deterministic three-sentence French description used when the LLM
 * one-liner is unusable.
 *
 * One of three wordings is chosen from a hash of name and city, so the same
 * company always gets the same text. The result goes through
 * `finalizeNarrativeSentenceBlock` (max 420 chars); when it is shorter than
 * 220 characters an extra generic sentence is appended and the block is
 * finalized again.
 *
 * @param {object} facts
 * @param {string} facts.name - Company name, interpolated as-is.
 * @param {string} facts.industry - Industry label (drives the activity wording).
 * @param {string} [facts.city] - City name.
 * @param {string} [facts.postal_code] - Postal code, used to look up the province.
 * @param {Array<*>} [facts.services] - Service labels (at most 4 are used).
 * @returns {string|null} Whatever `finalizeNarrativeSentenceBlock` returns.
 */
function buildFallbackSeedDescriptionLine1({ name, industry, city, postal_code, services }) {
  const prov = postal_code ? beProvinceFromPostalCode(postal_code) : null;
  // Mention the province only when it adds information (differs from the city).
  const loc =
    city && prov && normalizeCityName(city) !== normalizeCityName(prov) ? `${city} (province de ${prov})` : city || prov || "";
  const areaLabel = loc || city || "la région";

  const activity = shortActivityLabel(industry, services);
  const normalizedServices = normalizeServicesList(services, { max: 4 });
  const svc = formatListFr(normalizedServices.map((x) => serviceWithDefiniteArticleFr(x)).filter(Boolean));

  // First hash byte modulo 3: stable wording per (name, city) pair.
  const variant = Number.parseInt(sha256Hex(`${name || ""}|${city || ""}`).slice(0, 2), 16) % 3;

  let s1 = "";
  let s2 = "";
  let s3 = "";

  if (variant === 0) {
    // Was the mojibake "Ã€" (UTF-8 bytes of "À" decoded as Windows-1252).
    s1 = `À ${areaLabel}, ${name} intervient sur des besoins de ${activity}.`;
    s2 = svc
      ? `Les demandes traitées couvrent notamment ${svc}, avec une méthode orientée diagnostic et solution durable.`
      : `Les interventions couvrent des prestations de ${activity}, avec une méthode orientée diagnostic et solution durable.`;
    s3 = `Le premier échange sert à cadrer le besoin, expliquer les étapes et confirmer un plan d'action adapté.`;
  } else if (variant === 1) {
    s1 = `${name} accompagne les clients à ${areaLabel} pour des prestations de ${activity}.`;
    s2 = svc
      ? `Son périmètre inclut ${svc}, avec un accent sur la clarté des recommandations et la qualité d'exécution.`
      : `Son périmètre couvre des interventions de ${activity}, avec un accent sur la clarté des recommandations et la qualité d'exécution.`;
    s3 = `L'approche privilégie un diagnostic précis, puis une intervention structurée selon le contexte du chantier.`;
  } else {
    s1 = `Sur ${areaLabel}, ${name} prend en charge des interventions de ${activity} pour particuliers et professionnels.`;
    s2 = svc
      ? `Les prestations les plus courantes concernent ${svc}, avec un suivi méthodique pendant chaque étape.`
      : `Les prestations les plus courantes concernent ${activity}, avec un suivi méthodique pendant chaque étape.`;
    s3 = `Chaque demande commence par une analyse de la situation afin de proposer une solution cohérente et durable.`;
  }

  let out = finalizeNarrativeSentenceBlock(`${s1} ${s2} ${s3}`, { maxLen: 420 });

  // Pad short results with one generic sentence.
  if (out && out.length < 220) {
    const extra =
      "Le site met en avant une méthode claire: comprendre la situation, proposer une solution adaptée et expliquer les étapes de l'intervention.";
    out = finalizeNarrativeSentenceBlock(`${out} ${extra}`, { maxLen: 420 });
  }

  return out;
}

/**
 * Validates an LLM description and reports why it was rejected, if it was.
 *
 * Checks run in this order: empty after finalization, missing company-name
 * token, shorter than MIN_ONE_LINER_LENGTH, weak/templated wording. On
 * rejection the value is `null` when STRICT_LLM_DESCRIPTION is set, otherwise
 * the deterministic fallback description.
 *
 * @param {*} raw - Raw LLM text.
 * @param {object} facts - Company facts (`name` is read here; the whole object
 *   is forwarded to the fallback builder).
 * @returns {{ value: string|null, reason: string|null, candidate: string|null }}
 */
function normalizeSeedDescriptionLine1WithReason(raw, facts) {
  const cleaned = finalizeNarrativeSentenceBlock(raw, { maxLen: 420 });

  // The fallback is only built when it is actually returned.
  const rejected = (reason, candidate) => ({
    value: STRICT_LLM_DESCRIPTION ? null : buildFallbackSeedDescriptionLine1(facts),
    reason,
    candidate,
  });

  if (!cleaned) return rejected("empty_or_unparseable", null);
  if (!descriptionContainsCompanyName(cleaned, facts?.name || "")) {
    return rejected("missing_company_name_token", cleaned);
  }
  if (cleaned.length < MIN_ONE_LINER_LENGTH) return rejected("too_short", cleaned);
  if (oneLinerLooksWeak(cleaned)) return rejected("weak_or_template", cleaned);

  return { value: cleaned, reason: null, candidate: cleaned };
}

/**
 * Convenience wrapper returning only the `value` part of
 * `normalizeSeedDescriptionLine1WithReason`.
 *
 * @param {*} raw - Raw LLM text.
 * @param {object} facts - Company facts.
 * @returns {string|null}
 */
function normalizeSeedDescriptionLine1(raw, facts) {
  const { value } = normalizeSeedDescriptionLine1WithReason(raw, facts);
  return value;
}

/**
 * Sanitizes the "why this company" quote and clamps it to 320 characters.
 *
 * @param {*} raw - Raw quote.
 * @returns {*} Whatever `clampLine` returns for the sanitized text.
 */
function normalizeWhyCompanyQuote(raw) {
  const publicText = sanitizePublicFreeText(raw);
  return clampLine(publicText, 320);
}

/**
 * Rough language check: counts occurrences of a few French and English marker
 * words and accepts the text when French markers are at least as frequent.
 * A text with no marker of either kind is accepted.
 *
 * @param {*} s - Text to check; normalized with `normalizeBceForTokenMatch`.
 * @returns {boolean} `false` when the normalized text is empty.
 */
function looksFrenchEnoughText(s) {
  const text = normalizeBceForTokenMatch(s);
  if (!text) return false;

  const countMatches = (pattern) => {
    const found = text.match(pattern);
    return found ? found.length : 0;
  };
  const frenchMarkers = countMatches(
    /\b(le|la|les|de|des|du|un|une|avec|pour|dans|sur|intervention|qualite|securite|diagnostic|suivi)\b/g
  );
  const englishMarkers = countMatches(
    /\b(the|and|with|for|to|of|service|quality|safety|ensure|improve|reduce|maintenance)\b/g
  );
  return frenchMarkers >= englishMarkers;
}

/**
 * Normalizes a ritual frequency label.
 *
 * The text is sanitized, clamped to 40 characters and stripped of
 * "proof=null" / "null" residue. English frequency words are mapped to their
 * French label; any other text is returned as cleaned.
 *
 * @param {*} raw - Raw frequency.
 * @returns {string|null} French label, cleaned text, or `null` when empty.
 */
function normalizeRitualFrequency(raw) {
  const clamped = clampLine(sanitizePublicFreeText(raw), 40);
  if (!clamped) return null;

  const label = clamped.replace(/\bproof\s*=\s*null\b/gi, "").replace(/\bnull\b/gi, "").trim();
  if (!label) return null;

  const key = normalizeBceForTokenMatch(label);
  if (!key || /proof\s*=\s*null/.test(key)) return null;

  // Ordered: the first matching English keyword decides.
  const translations = [
    [/daily/, "Quotidien"],
    [/weekly/, "Hebdomadaire"],
    [/monthly/, "Mensuel"],
    [/quarterly/, "Trimestriel"],
    [/yearly|annual/, "Annuel"],
  ];
  const hit = translations.find(([pattern]) => pattern.test(key));
  return hit ? hit[1] : label;
}

/**
 * Cleans up to five LLM-provided rituals.
 *
 * Each field is sanitized and clamped, "proof=null" residue is removed, text
 * that does not look French is replaced by a generic French sentence, and
 * missing frequency/impact get default values. Rituals without a name or a
 * description are dropped.
 *
 * @param {Array<object>} rituals - Items with name, description, frequency,
 *   impact and optional proof; a non-array is treated as empty.
 * @returns {Array<{name: string, description: string, frequency: string, impact: string, proof: string|null}>}
 */
function normalizeRituals(rituals) {
  const proofNullMarker = /\bproof\s*=\s*null\b/i;
  const dropProofNull = (text) =>
    text
      ? text.replace(/\.?\s*proof\s*=\s*null\.?/gi, "").replace(/\bnull\b/gi, "").replace(/\s{2,}/g, " ").trim()
      : text;
  const cleanSentence = (value, maxLen) => clampLine(dropProofNull(sanitizePublicFreeText(value)), maxLen);

  const candidates = Array.isArray(rituals) ? rituals.slice(0, 5) : [];
  const normalized = [];

  for (const ritual of candidates) {
    const name = clampLine(sanitizePublicFreeText(ritual?.name), 60);
    let description = cleanSentence(ritual?.description, 220);
    let frequency = normalizeRitualFrequency(ritual?.frequency);
    let impact = cleanSentence(ritual?.impact, 160);
    let proof = ritual?.proof ? clampLine(sanitizePublicFreeText(ritual.proof), 160) : null;

    // Replace non-French wording by neutral French sentences.
    if (description && !looksFrenchEnoughText(description)) {
      description = "Intervention structurée avec vérifications à chaque étape pour sécuriser l'exécution.";
    }
    if (impact && !looksFrenchEnoughText(impact)) {
      impact = "Améliore la fiabilité du résultat et réduit les risques de reprise ultérieure.";
    }
    if (frequency && !looksFrenchEnoughText(frequency)) {
      frequency = "Selon le besoin";
    }

    // Drop leftovers of a literal "proof=null" / "null" emitted by the model.
    if (frequency && proofNullMarker.test(frequency)) frequency = null;
    if (impact && proofNullMarker.test(impact)) impact = null;
    if (proof && /^null$/i.test(normalizeBceForTokenMatch(proof))) proof = null;

    frequency = frequency || "Selon le besoin";
    impact = impact || "Améliore la fiabilité et la qualité d'exécution.";

    if (name && description && frequency && impact) {
      normalized.push({ name, description, frequency, impact, proof });
    }
  }
  return normalized;
}

// Labels that normalizeEmotionalNeedLabel() picks from by hash index when
// nothing else matches. Order matters: the index selects the label.
const EMOTIONAL_NEED_CHOICES = [
  "Confiance", "Sérénité", "Sécurité", "Fiabilité", "Proximité",
  "Réactivité", "Expertise", "Accompagnement", "Durabilité", "Clarté",
  "Soulagement", "Joie", "Réconfort",
];

/**
 * Maps a free-text emotional need to one of the canonical French labels.
 *
 * 1. The raw text is matched against keyword rules (first match wins).
 * 2. A label is also inferred from the industry/services context.
 * 3. A raw label of "Confiance", "Sérénité" or "Sécurité" is replaced by the
 *    inferred one when the context suggests something different.
 * 4. Without any match, a label is picked deterministically from a hash of
 *    industry, services and raw text, to avoid repeating only 2-3 labels.
 *
 * @param {*} raw - Label proposed by the LLM.
 * @param {{ industry?: string, services?: Array<string>|string }} [context]
 * @returns {string} One of the canonical labels.
 */
function normalizeEmotionalNeedLabel(raw, { industry = "", services = [] } = {}) {
  const servicesText = Array.isArray(services) ? services.join(" ") : String(services || "");
  const contextCorpus = normalizeBceForTokenMatch(`${industry || ""} ${servicesText}`);

  const firstMatchingLabel = (rules, text) => {
    const rule = rules.find(([pattern]) => pattern.test(text));
    return rule ? rule[1] : null;
  };

  // Ordered rules applied to the industry/services context.
  const contextRules = [
    [/(urgence|depannage|24h|7j|intervention rapide)/, "Soulagement"],
    [/(devis|tarif|transparent|diagnostic|explication|conseil)/, "Clarté"],
    [/(esthetique|design|renovation|facade|vitrerie|finitions)/, "Joie"],
    [/(confort|bien etre|chauffage|habitat|tranquillite)/, "Réconfort"],
    [/(durable|isolation|maintenance|entretien|long terme)/, "Durabilité"],
    [/(proximite|local)/, "Proximité"],
    [/(securite|conformite|norme|protection)/, "Sécurité"],
    [/(fiabilite|garantie)/, "Fiabilité"],
    [/(reactivite|rapidite)/, "Réactivité"],
  ];

  // Ordered rules applied to the raw label itself.
  const rawRules = [
    [/(confiance|trust)/, "Confiance"],
    [/(serenite|tranquillite)/, "Sérénité"],
    [/(securite|protection)/, "Sécurité"],
    [/(fiabilite)/, "Fiabilité"],
    [/(proximite)/, "Proximité"],
    [/(reactivite|rapidite)/, "Réactivité"],
    [/(expertise|competence)/, "Expertise"],
    [/(accompagnement|suivi|conseil)/, "Accompagnement"],
    [/(durabilite|long terme)/, "Durabilité"],
    [/(clarte|transparence)/, "Clarté"],
    [/(soulagement)/, "Soulagement"],
    [/(joie|plaisir)/, "Joie"],
    [/(reconfort|réconfort)/, "Réconfort"],
  ];

  const s = normSpaces(raw);
  const rawLabel = s ? firstMatchingLabel(rawRules, normalizeBceForTokenMatch(s)) : null;
  const inferred = firstMatchingLabel(contextRules, contextCorpus);

  if (rawLabel) {
    // The three most generic labels give way to a more specific context hint.
    const isGenericLabel = ["Confiance", "Sérénité", "Sécurité"].includes(rawLabel);
    return isGenericLabel && inferred && inferred !== rawLabel ? inferred : rawLabel;
  }
  if (inferred) return inferred;

  // Deterministic fallback to avoid repeating only 2-3 labels.
  const seed = `${industry}|${(Array.isArray(services) ? services.join("|") : "").slice(0, 200)}|${s || ""}`;
  const idx = Number.parseInt(sha256Hex(seed).slice(0, 2), 16) % EMOTIONAL_NEED_CHOICES.length;
  return EMOTIONAL_NEED_CHOICES[idx];
}

// Question words and filler words that faqTokensForMatch() ignores when
// extracting the meaningful tokens of a FAQ question.
const FAQ_MATCH_STOPWORDS = new Set([
  // interrogatives
  "comment", "quels", "quelles", "quel", "quelle", "pourquoi",
  // generic nouns
  "entreprise", "societe", "société",
  // pronouns, prepositions, articles
  "cette", "est", "elle", "dans", "avec", "pour", "sur",
  "une", "des", "les", "aux", "vos", "votre",
]);

/**
 * Extracts the meaningful tokens of a FAQ text: space-separated pieces of the
 * normalized text that are at least 4 characters long and not stopwords.
 *
 * @param {*} s - Text; normalized with `normalizeBceForTokenMatch`.
 * @returns {string[]}
 */
function faqTokensForMatch(s) {
  const meaningful = [];
  for (const piece of normalizeBceForTokenMatch(s).split(" ")) {
    const word = piece.trim();
    if (word.length >= 4 && !FAQ_MATCH_STOPWORDS.has(word)) meaningful.push(word);
  }
  return meaningful;
}

/**
 * Detects boilerplate FAQ answers.
 *
 * @param {string} answerNorm - Already-normalized answer text.
 * @returns {boolean} `true` when the answer is empty or contains one of the
 *   known boilerplate phrases.
 */
function faqAnswerLooksGeneric(answerNorm) {
  if (!answerNorm) return true;
  const boilerplatePhrases = [
    /equipe dirigeante/,
    /engagement envers la satisfaction client/,
    /solution adaptee au contexte/,
  ];
  return boilerplatePhrases.some((phrase) => phrase.test(answerNorm));
}

/**
 * Checks that a FAQ answer plausibly addresses its question.
 *
 * Generic answers are rejected. The pair is accepted when the answer shares a
 * token with the question (exact, first 6 characters, or singular form). If
 * no token is shared, the first question theme that matches (differentiation,
 * quality, process, zone) decides based on expected vocabulary in the answer.
 *
 * @param {*} question - Question text.
 * @param {*} answer - Answer text.
 * @returns {boolean}
 */
function faqAnswerMatchesQuestion(question, answer) {
  const qNorm = normalizeBceForTokenMatch(question);
  const aNorm = normalizeBceForTokenMatch(answer);
  if (!qNorm || !aNorm) return false;
  if (faqAnswerLooksGeneric(aNorm)) return false;

  const qTokens = faqTokensForMatch(qNorm);
  // Nothing meaningful to compare: give the pair the benefit of the doubt.
  if (qTokens.length === 0) return true;

  const answerEchoes = (token) =>
    aNorm.includes(token) ||
    (token.length >= 6 && aNorm.includes(token.slice(0, 6))) ||
    (token.endsWith("s") && token.length >= 5 && aNorm.includes(token.slice(0, -1)));
  if (qTokens.some(answerEchoes)) return true;

  // Ordered [question theme, expected answer vocabulary]; first theme hit decides.
  const themeRules = [
    [/distingu|difference|atout|pourquoi/, /(approche|methode|process|specialis|experience|qualite|suivi|reactiv|urgence|organisation)/],
    [/qualite|fiabilit|durable/, /(controle|verification|suivi|checklist|protocole|rituel|diagnostic)/],
    [/intervention|deroule|etape|process/, /(diagnostic|etape|intervention|deplacement|validation|echange|prise de contact)/],
    [/\b(zone|secteur|commune|region|ville|ou)\b/, /(zone|secteur|commune|region|ville|voisine|proximite)/],
  ];
  const theme = themeRules.find(([questionPattern]) => questionPattern.test(qNorm));
  return theme ? theme[1].test(aNorm) : false;
}

/**
 * Classifies a FAQ question into a coarse topic used for de-duplication.
 * Rules are evaluated in order; the first keyword match wins.
 *
 * @param {*} qNorm - Question text (normalized again here).
 * @returns {"legal"|"zone"|"process"|"quality"|"differentiation"|"services"|null}
 */
function faqTopicKey(qNorm) {
  const question = normalizeBceForTokenMatch(qNorm);
  if (!question) return null;

  const topicRules = [
    ["legal", /(rgpd|privacy|donnees|personnelles|cookies|confidentialite|mention legale)/],
    ["zone", /(zone|secteur|commune|region|ville|intervient)/],
    ["process", /(intervention|deroule|etape|process|premiere)/],
    ["quality", /(qualite|fiabilit|suivi|controle)/],
    ["differentiation", /(distingu|difference|atout|pourquoi)/],
    ["services", /(types?|prestations?|services?)/],
  ];
  for (const [topic, pattern] of topicRules) {
    if (pattern.test(question)) return topic;
  }
  return null;
}

/**
 * Filters and normalizes a company FAQ.
 *
 * At most the first 12 items are examined and at most 6 are returned. An item
 * is dropped when: question or answer is empty, the answer is shorter than 28
 * characters, the question is about contact/hours/pricing or a legal topic,
 * its topic was already kept, the answer does not match the question, or the
 * question/answer duplicates a kept one. For "differentiation" questions, the
 * leading "L'entreprise se distingue par" is removed from the answer.
 *
 * @param {Array<{question: *, answer: *}>} companyFaq - Raw FAQ items; a
 *   non-array is treated as empty.
 * @returns {Array<{question: string, answer: string}>} Questions end with a single "?".
 */
function normalizeCompanyFaq(companyFaq) {
  const coveredElsewhere = /(contact|email|telephone|numero|adresse|horaire|ouverture|tarif|prix|devis)/;
  const differentiationPrefix = /^l'entreprise se distingue par\s+(.+)$/i;

  // Turns "L'entreprise se distingue par xyz" into "Xyz." when the prefix is present.
  const unwrapDifferentiation = (answer) => {
    const match = answer.match(differentiationPrefix);
    if (!match) return answer;
    let tail = normSpaces(match[1] || "");
    if (!tail) return answer;
    if (/^[a-zà-ÿ]/u.test(tail)) tail = tail.slice(0, 1).toUpperCase() + tail.slice(1);
    if (!/[.!?]$/.test(tail)) tail = `${tail}.`;
    return tail;
  };

  const items = Array.isArray(companyFaq) ? companyFaq.slice(0, 12) : [];
  const kept = [];
  const usedQuestions = new Set();
  const usedAnswers = new Set();
  const usedTopics = new Set();

  for (const item of items) {
    const rawQuestion = clampLine(sanitizePublicFreeText(item?.question), 180);
    let answer = clampLine(sanitizePublicFreeText(item?.answer), 500);
    if (!rawQuestion || !answer || answer.length < 28) continue;

    const question = rawQuestion.replace(/\?+$/g, "").trim();
    if (!question) continue;

    // Skip FAQ topics already covered by dedicated blocks/components.
    const questionNorm = normalizeBceForTokenMatch(question);
    if (coveredElsewhere.test(questionNorm)) continue;
    const topic = faqTopicKey(questionNorm);
    if (topic === "legal") continue;
    if (topic && usedTopics.has(topic)) continue;

    if (topic === "differentiation") answer = unwrapDifferentiation(answer);

    if (!faqAnswerMatchesQuestion(question, answer)) continue;

    const questionKey = normalizeCityName(question);
    if (!questionKey || usedQuestions.has(questionKey)) continue;

    const answerKey = normalizeCityName(answer);
    if (!answerKey || usedAnswers.has(answerKey)) continue;

    usedQuestions.add(questionKey);
    usedAnswers.add(answerKey);
    if (topic) usedTopics.add(topic);
    kept.push({ question: `${question}?`, answer });
    if (kept.length >= 6) break;
  }

  return kept;
}

/**
 * Builds a deterministic French FAQ from already-known facts, for use when no
 * usable LLM FAQ is available.
 *
 * Candidate Q/A pairs are assembled in a fixed order (up to three per-service
 * questions, most frequent services, first contact, zone, quality, optional
 * urgency, optional differentiation) and passed through normalizeCompanyFaq().
 * When fewer than 4 pairs survive that filter, the candidates are returned
 * with only sanitizing/clamping applied (max 6).
 *
 * @param {object} facts
 * @param {Array<*>} [facts.services] - Service labels (at most 5 are used).
 * @param {string} [facts.zone] - Intervention zone text.
 * @param {string} [facts.firstContact] - Text describing the first contact.
 * @param {Array<object>} [facts.rituals] - Only `rituals[0].description` is read.
 * @param {string} [facts.whyCompany] - Differentiation text.
 * @param {string} [facts.availability] - Availability text.
 * @returns {Array<{question: string, answer: string}>}
 */
function buildFallbackCompanyFaq({ services, zone, firstContact, rituals, whyCompany, availability }) {
  const normalizedServices = normalizeServicesList(services, { max: 5 });
  const servicesText = formatListFr(normalizedServices.map((x) => String(x).toLowerCase()));
  const zoneText = clampLine(sanitizePublicFreeText(zone), 120);
  const firstContactText = clampLine(sanitizePublicFreeText(firstContact), 320);
  const whyCompanyText = clampLine(sanitizePublicFreeText(whyCompany), 320);
  const availabilityText = clampLine(sanitizePublicFreeText(availability), 140);
  // Wording variant (0, 1 or 2) derived from a hash of zone and services, so
  // the same inputs always produce the same phrasing.
  const variant = Number.parseInt(
    sha256Hex(`${zoneText || ""}|${servicesText || ""}|${normalizedServices.join("|")}`).slice(0, 2),
    16
  ) % 3;
  const qualityFromRitualRaw = clampLine(
    sanitizePublicFreeText(Array.isArray(rituals) && rituals[0]?.description),
    220
  );
  const qualityRitualNorm = normalizeBceForTokenMatch(qualityFromRitualRaw || "");
  // Reuse the first ritual as the quality answer only when it talks about
  // checks/follow-up; its first uppercase letter is lower-cased to fit the sentence.
  const qualityFromRitual =
    qualityFromRitualRaw && /(controle|verification|suivi|maintenance|checklist|protocole|diagnostic)/.test(qualityRitualNorm)
      ? `La qualité est suivie via ${qualityFromRitualRaw.replace(/^\p{Lu}/u, (m) => m.toLowerCase())}`
      : null;

  const qas = [];

  // One yes/no question per service, for the first three services.
  for (const svc of normalizedServices.slice(0, 3)) {
    const serviceLower = String(svc || "").toLowerCase();
    if (!serviceLower) continue;
    // Use "des services de/d'" to avoid masculine/feminine article errors (réparation, installation, etc.)
    const VOWELS_DE = /^[aeiouyàâäéèêëîïôùûüœæh]/i;
    const dePrep = VOWELS_DE.test(serviceLower) ? "d'" : "de ";
    qas.push({
      question: `Proposez-vous des services ${dePrep}${serviceLower}`,
      answer: `Oui, cette prestation fait partie des interventions proposées, avec vérification sur place avant réalisation.`,
    });
  }

  qas.push({
    question: "Quelles sont les prestations les plus fréquentes",
    answer:
      servicesText
        ? `Les demandes les plus courantes portent sur ${servicesText}. Chaque intervention débute par un diagnostic pour proposer la solution adaptée.`
        : "Les interventions couvrent les besoins courants et urgents, avec un diagnostic préalable pour orienter la solution.",
  });

  // First contact, zone and quality questions; zone and quality wording depend on `variant`.
  qas.push(
    {
      question: "Comment se déroule un premier contact",
      answer:
        firstContactText ||
        "Le premier échange permet de cerner précisément le besoin, d'établir un diagnostic et de proposer un plan d'action concret.",
    },
    {
      question:
        variant === 0
          ? "Sur quelle zone géographique intervenez-vous"
          : variant === 1
            ? "Quelles communes sont couvertes en priorité"
            : "Intervenez-vous au-delà de la zone principale",
      answer:
        zoneText
          ? variant === 2
            ? `La zone principale est ${zoneText}. Les interventions peuvent s'étendre aux communes limitrophes selon le besoin.`
            : `Les interventions se déroulent à ${zoneText} et dans les communes avoisinantes, selon la nature de la demande.`
          : variant === 1
            ? "Les interventions se concentrent sur la zone locale et les communes proches."
            : "La zone d'intervention couvre le secteur local et les communes environnantes.",
    },
    {
      question:
        variant === 0
          ? "Comment la qualité des interventions est-elle assurée"
          : variant === 1
            ? "Quelles garanties offrez-vous sur vos interventions"
            : "Comment contrôlez-vous la qualité de votre travail",
      answer:
        qualityFromRitual ||
        (variant === 2
          ? "Des vérifications sont effectuées à chaque étape de l'intervention, avec un contrôle final avant la clôture du chantier."
          : "La qualité repose sur une méthode structurée, des vérifications en cours d'intervention et un contrôle final avant la remise au client."),
    },
  );

  // Urgency question only when the availability text mentions it.
  if (availabilityText && /(24h|7j|urgence|sur rendez-vous|garde)/i.test(availabilityText)) {
    qas.push({
      question: "Gérez-vous les interventions urgentes",
      answer: `Oui, l'organisation permet de répondre aux demandes urgentes (${availabilityText}).`,
    });
  }

  if (whyCompanyText) {
    // A text that already reads as a full sentence is used as-is; otherwise it
    // is introduced by a short lead-in.
    const whyAlreadySelfStanding =
      /[.!?]$/.test(whyCompanyText) || /^(nous|notre|l'entreprise|entreprise|chez)\b/i.test(whyCompanyText);
    const whyAlreadyPrefixed = /^l'entreprise se distingue par\b/i.test(whyCompanyText);
    const whyAnswer = whyAlreadyPrefixed
      ? whyCompanyText
      : whyAlreadySelfStanding
        ? whyCompanyText
        : `Ce qui les distingue localement: ${whyCompanyText}`;
    // NOTE(review): "ajoutee" and "marche" lack the accents used in the other
    // questions ("ajoutée", "marché") — confirm whether this is intentional.
    qas.push({
      question: variant === 1 ? "Quelle est votre valeur ajoutee locale" : "Qu'est-ce qui vous distingue sur votre marche local",
      answer: whyAnswer,
    });
  }

  // Preferred path: run the candidates through the same filter as LLM FAQs.
  const normalized = normalizeCompanyFaq(
    qas.map((x) => ({
      question: x.question,
      answer: x.answer,
    }))
  );
  if (normalized.length >= 4) return normalized.slice(0, 6);

  // Keep useful Q/R when strict filtering is too aggressive.
  const direct = [];
  for (const x of qas) {
    const q = clampLine(sanitizePublicFreeText(x.question), 180);
    const a = clampLine(sanitizePublicFreeText(x.answer), 500);
    if (!q || !a || a.length < 24) continue;
    direct.push({ question: `${q.replace(/\?+$/, "").trim()}?`, answer: a });
    if (direct.length >= 6) break;
  }
  return direct;
}

/**
 * Build the "company card" enrichment request (JSON schema + French prompt)
 * and submit it through `ollamaJson`.
 *
 * All hints are interpolated verbatim into the prompt text:
 * @param {object} args
 * @param args.industry - Trade label, rendered on the "Metier:" line.
 * @param args.cityHint - City hint, rendered on the "Commune (indice):" line.
 * @param args.companyName - Company name; "(inconnu)" is used when falsy.
 * @param args.corpus - Crawled site text, appended after "CONTENU CRAWLE:".
 * @param args.priceRangeHint - Rendered as `jsonld_priceRange`; the literal "null" when falsy.
 * @param args.devisGratuitHint - Rendered as "true"/"false" based on truthiness.
 * @param args.retryAttempt - Attempt number (default 1); values > 1 add the retry instructions.
 * @param args.maxAttempts - Only used for display inside the retry instructions (default 1).
 * @param args.temperature - Forwarded to `ollamaJson` as `{ temperature }` (default null).
 * @returns Whatever `ollamaJson(schema, prompt, { temperature })` resolves to.
 */
async function enrichWithLLM({
  industry,
  cityHint,
  companyName,
  corpus,
  priceRangeHint,
  devisGratuitHint,
  retryAttempt = 1,
  maxAttempts = 1,
  temperature = null,
}) {
  // Expected shape of the LLM answer. Note that zone_intervention,
  // premiere_prise_contact, piliers, why_entrepreneur and confidence are
  // declared here but receive no dedicated instruction in the prompt below.
  const schema = {
    type: "object",
    properties: {
      one_liner: { type: "string" },
      services_principaux: { type: "string" },
      zone_intervention: { type: "string" },
      premiere_prise_contact: { type: "string" },
      piliers: { type: "array", items: { type: "string" }, minItems: 2 },
      tags: { type: "array", items: { type: "string" }, minItems: 6 },
      emotional_need_label: { type: "string" },
      why_company: { type: "string" },
      why_entrepreneur: { type: ["string", "null"] },
      pricing_model: {
        type: ["string", "null"],
        enum: ["sur_devis", "forfait", "horaire", "a_partir_de", null],
      },
      budget_level: { type: ["integer", "null"], minimum: 1, maximum: 3 },
      price_indication: { type: ["string", "null"] },
      company_faq: {
        type: ["array", "null"],
        items: {
          type: "object",
          properties: {
            question: { type: "string" },
            answer: { type: "string" },
          },
          required: ["question", "answer"],
        },
      },
      rituals: {
        type: "array",
        minItems: 3,
        items: {
          type: "object",
          properties: {
            name: { type: "string" },
            description: { type: "string" },
            frequency: { type: "string" },
            proof: { type: ["string", "null"] },
            impact: { type: "string" },
          },
          required: ["name", "description", "frequency", "impact"],
        },
      },
      ras_score: { type: "number" },
      confidence: { type: "number" },
    },
    required: ["one_liner", "services_principaux", "tags", "rituals", "emotional_need_label", "why_company", "ras_score", "confidence"],
  };

  // Extra instructions appended only on retries (attempt 2 and later); they ask
  // the model to rephrase and to avoid the boilerplate opening sentence.
  const retryHints =
    retryAttempt > 1
      ? `\nCONSIGNES RETRY (${retryAttempt}/${maxAttempts}):\n- Reformule de maniere differente des essais precedents.\n- Interdiction absolue d'utiliser "X est une entreprise de Y active a Z".\n- Commence directement par une action, un benefice concret ou une specialisation.\n`
      : "";

  // The prompt is runtime text sent to the model: rules first, then retry
  // hints, pricing hints, and finally the crawled corpus. It is trimmed once
  // at the end.
  const prompt = `
Tu aides a creer une fiche entreprise "a revendiquer".
Nom de l'entreprise: ${companyName || "(inconnu)"}
Metier: ${industry}
Commune (indice): ${cityHint}

Regles CRITIQUES:
- Interdiction: ne mets jamais le mot "Revelys" dans les champs (pas de prefixe "Revelys:", pas de mention).
- Ne promets rien que tu n'as pas vu. Si tu deduis, formule prudemment.
- Langue obligatoire: francais uniquement (aucun mot ou phrase en anglais).
- Aucun prix, aucun numero de telephone, aucun email dans one_liner / why_company.
- Rituels: donne 3 a 5 rituels CONCRETS et OBSERVABLES propres au metier. Si deduction, mets proof=null.
- Tags: courts, en francais, 6 a 12. Pas de snake_case, pas d'underscore, pas de mots colles. Utilise des espaces si besoin.
- IMPORTANT TAGS: evite les tags creux ("professionnel", "qualite", "meilleur", "service", "rapide", "expert" si non prouve).
- INTERDICTION TAGS ABSOLUE: ne mets JAMAIS dans les tags un terme deja present dans services_principaux (ni le meme mot, ni une reformulation). Exemple interdit si services contient "Reparation de fuites": ne mets pas "reparation de fuites", "fuite", "reparation" dans les tags. Les tags sont des angles complementaires: problemes vecus par le client, situations d'urgence, equipements specifiques, qualifications, certifications.
- VOCABULAIRE TAGS: pense du point de vue du client (ce qu'il tape ou dit): probleme concret ("fuite d'eau", "canalisation bouchee", "serrure bloquee"), situation urgente, equipement specifique, qualification distinctive. Evite le vocabulaire marketing.
- STRUCTURE TAGS: vise 3-5 tags cote client (problemes/situations), 1-2 modalites (urgence, devis gratuit, 24h), 1 signature distinctive max.
- one_liner (description publique complete de la fiche): 220-420 caracteres, 2-4 phrases fluides, 3e personne du singulier, francais naturel (ton premium, pas robotique), toujours termine par une phrase complete. OBLIGATOIRE: le nom de l'entreprise doit apparaitre dans la premiere phrase. Ce champ est afiche tel quel comme description publique — autonome, sans style telegraphique ni liste. Il n'y aura aucun ajout de template apres.
- INTERDIT one_liner: "X est une entreprise de Y active a Z", "X offre un service de Y", "X propose des services de Y", "X garantit une intervention rapide", formulations creuses ("equipe de professionnels", "qualite et proximite"), superlatifs non prouves. Commence par une caracteristique distinctive, une specialisation concrete ou une zone forte.
- services_principaux: 3 a 6 items, separes par des virgules (pas de tirets), sans ville, sans texte additionnel, avec grammaire correcte (ex: "Reparation de fuites", "Entretien du chauffe-eau").
- Besoin emotionnel (emotional_need_label): 1 label parmi: Serenite, Confiance, Securite, Reconfort, Joie, Clarte, Soulagement, Durabilite, Tranquillite, Fierte, Efficacite.
  Choisis selon le METIER et le ton du site: plombier/debouchage → Soulagement; electricien → Securite; serrurier → Securite ou Confiance; chauffagiste/climatisation → Serenite ou Reconfort; couvreur → Durabilite ou Tranquillite; peintre/facade/crepi → Fierte ou Joie.
  IMPORTANT: adapte au contenu reel du site. Si le site insiste sur l'urgence → Soulagement; sur la duree de vie → Durabilite; sur la satisfaction client → Joie; sur la tranquillite d'esprit → Serenite. Ne choisis pas systematiquement le meme label.
- why_company (affiche comme citation): 180-320 caracteres, 1-2 phrases max, orientees "differenciation", base sur le texte.
- rituals: 3 a 5 items. description 120-220 caracteres. Nom du rituel: 3-7 mots, specifique au metier.
  CONCRETS: decrire un geste, une etape, un outil, un protocole observable (ex: "Photo avant/apres intervention", "Nettoyage avec aspirateur industriel", "Test de pression sur la canalisation reparee", "Verification du tableau electrique avant mise en service").
  INTERDITS (trop generiques, sauf si explicitement mentionnes sur le site): "Formation continue", "Suivi post-intervention", "Evaluation prealable", "Analyse prealable", "Controle qualite", "Engagement qualite", "Bilan de fin de chantier", "Certification des equipes". Ces termes vagues ne constituent pas un rituel concret.
- company_faq: 4 a 6 Q/R specifiques et utiles au site. La FAQ generee par le LLM est utilisee directement sans melange avec des templates — vise 4 a 6 entrees de qualite si le corpus le permet. Si le corpus est vraiment trop pauvre pour produire des reponses specifiques, laisse le tableau vide [] — une FAQ vide est preferable a des generiques.
  Contenu: base sur le contenu reel du site (tarifs, delais, certifications, marques, materiaux, garanties, zone precise, urgences, disponibilite). Pas de "Quels types d'interventions" ou "Comment la qualite est garantie" si le site ne le mentionne pas.
  Reponses: phrases completes et naturelles (pas de listes a puces), style direct, 80-300 caracteres par reponse.
  Format question: francais correct avec accents, phrase naturelle se terminant par "?" implicitement.
  INTERDITS FAQ: questions generiques sans ancrage dans le contenu du site, doublons semantiques, formulations marketing ("Nous garantissons", "Notre equipe de professionnels").
- ras_score: 0-100, note de richesse exploitable du site. CALIBRATION STRICTE:
  90-100 = site tres riche (services detailles, temoignages, tarifs, zone, FAQ, photos, histoire, equipe).
  70-89 = site correct (2-3 sections utiles, quelques specificites, contenu substantiel).
  50-69 = site pauvre (generique, textes courts, peu d'informations exploitables).
  30-49 = site tres pauvre (quasi vide, pure SEO ou page unique sans contenu).
  0-29 = site inutilisable (erreur, parking, redirect, contenu inexistant).
  SOIS STRICT: un site avec 200 mots generiques ne depasse pas 60. Ne mets pas 95 par defaut.
- Pricing (optionnel): si le site mentionne des tarifs, complete:
  - pricing_model: sur_devis | forfait | horaire | a_partir_de
  - budget_level: 1..3 (1=EUR, 2=EUR EUR, 3=EUR EUR EUR)
  - price_indication: texte court (ex: "A partir de 45EUR/seance")
  Si tu n'es pas sur ou si ce n'est pas explicite, mets null. Ne devine pas.
- Reponds STRICTEMENT en JSON (aucun texte hors JSON).
${retryHints}

PRICING_HINTS (si dispo):
- jsonld_priceRange: ${priceRangeHint || "null"}
- devis_gratuit_detected: ${devisGratuitHint ? "true" : "false"}

CONTENU CRAWLE:
${corpus}
`.trim();

  // Delegate the call; the schema and the temperature are passed through as-is.
  return await ollamaJson(schema, prompt, { temperature });
}

/**
 * Flatten `s` to a single line and shorten it to at most `maxLen` characters,
 * preferring to cut between words.
 *
 * @param {*} s - Value to clamp; it is stringified. `null`/`undefined` give `null`.
 * @param {number} maxLen - Maximum length. A non-finite or non-positive value
 *   disables truncation.
 * @returns {string|null} The clamped text, or `null` when nothing is left
 *   after whitespace normalization.
 */
function clampLine(s, maxLen) {
  if (s === null || s === undefined) return null;
  const v = normSpaces(String(s)).replace(/\s*\n\s*/g, " ").trim();
  if (!v) return null;
  if (!Number.isFinite(maxLen) || maxLen <= 0) return v;
  if (v.length <= maxLen) return v;

  const limit = Math.floor(maxLen);
  const head = v.slice(0, limit);

  // The cut falls exactly between two words, so no word is split: keep the
  // whole head instead of throwing away its last (complete) word.
  if (/\s/.test(v.charAt(limit))) return head.trim();

  // Otherwise drop the partial trailing word; if the head is one unbroken
  // word, fall back to a hard cut so something is always returned.
  return head.replace(/\s+\S*$/, "").trim() || head.trim();
}

/**
 * Shorten `s` to at most `maxLen` characters without ever cutting inside a
 * word. The cut is placed on the last whitespace/punctuation character found
 * at or before the limit, and trailing punctuation is removed.
 *
 * @param {*} s - Value to clamp; it is stringified. `null`/`undefined` give `null`.
 * @param {number} maxLen - Maximum length. A non-finite or non-positive value
 *   disables truncation.
 * @param {{ ellipsis?: boolean }} [options] - When `ellipsis` is true, "..." is
 *   appended and counted against `maxLen`.
 * @returns {string|null} The clamped text, or `null` when the input is empty,
 *   fewer than 8 characters of room remain, or no safe cut point exists.
 */
function clampAtWordBoundary(s, maxLen, { ellipsis = false } = {}) {
  if (s == null) return null;

  const text = normSpaces(String(s)).replace(/\s*\n\s*/g, " ").trim();
  if (!text) return null;

  const unlimited = !Number.isFinite(maxLen) || maxLen <= 0;
  if (unlimited || text.length <= maxLen) return text;

  const tail = ellipsis ? "..." : "";
  const budget = maxLen - tail.length;
  if (budget < 8) return null;

  // Walk backwards from the limit until a separator character is found.
  const isSeparator = (ch) => /\s|[,:;.!?\/\-–—()]/.test(ch);
  let cut = Math.min(budget, text.length - 1);
  while (cut >= 0 && !isSeparator(text[cut])) cut -= 1;

  // No separator before the limit: refuse rather than cut mid-word.
  if (cut < 0) return null;

  const kept = text
    .slice(0, cut)
    .replace(/[,:;.!?\/\-–—()]+$/g, "")
    .trim();
  if (!kept) return null;

  const fitsWithTail = kept.length + tail.length <= maxLen;
  return tail && fitsWithTail ? kept + tail : kept;
}

/**
 * Normalize an SEO text field without shortening it: phone-like content is
 * removed via `stripPhoneLikeFromSeoText`, line breaks become single spaces,
 * and the result goes through `normSpaces`.
 *
 * @param {*} s - Raw field value; `null`/`undefined` give `null`.
 * @returns {string|null} The normalized text, or `null` when nothing remains.
 */
function normalizeSeoTextNoTrunc(s) {
  if (s == null) return null;

  const withoutPhones = stripPhoneLikeFromSeoText(s) || "";
  const singleLine = String(withoutPhones).replace(/\s*\n\s*/g, " ").trim();
  const normalized = normSpaces(singleLine);
  return normalized || null;
}

/**
 * Recursively delete, in place, every property whose name is `telephone`,
 * `faxNumber` or `phoneNumber` (compared case-insensitively).
 *
 * @param {*} value - Object or array to clean; anything else is ignored.
 * @returns {void}
 */
function stripPhoneKeysDeep(value) {
  if (!value || typeof value !== "object") return;

  if (Array.isArray(value)) {
    value.forEach((entry) => stripPhoneKeysDeep(entry));
    return;
  }

  const phoneKeys = new Set(["telephone", "faxnumber", "phonenumber"]);
  Object.keys(value).forEach((rawKey) => {
    if (phoneKeys.has(String(rawKey).toLowerCase())) {
      delete value[rawKey];
    } else {
      stripPhoneKeysDeep(value[rawKey]);
    }
  });
}

/**
 * Recursively scrub, in place, phone-like content from every string found in
 * an object/array tree, using `stripPhoneLikeFromSeoText`.
 *
 * A string that becomes empty once scrubbed is removed entirely: the property
 * is deleted from its object, and the item is dropped from its array (arrays
 * are compacted so no empty-string placeholder is left behind).
 *
 * @param {*} value - Object or array to clean; anything else is ignored.
 * @returns {void}
 */
function stripPhoneValuesDeep(value) {
  if (value === null || value === undefined) return;

  if (Array.isArray(value)) {
    // Compact in place: `write` trails `read` whenever an item is dropped.
    let write = 0;
    for (let read = 0; read < value.length; read++) {
      const item = value[read];
      if (typeof item === "string") {
        const cleaned = stripPhoneLikeFromSeoText(item);
        if (!cleaned) continue; // phone-only string: drop the item
        value[write++] = cleaned;
        continue;
      }
      stripPhoneValuesDeep(item);
      value[write++] = item;
    }
    value.length = write;
    return;
  }

  if (typeof value !== "object") return;

  for (const k of Object.keys(value)) {
    const v = value[k];
    if (typeof v === "string") {
      const cleaned = stripPhoneLikeFromSeoText(v);
      if (!cleaned) delete value[k];
      else value[k] = cleaned;
      continue;
    }
    stripPhoneValuesDeep(v);
  }
}

/**
 * Recursively remove, in place, image references that `isSupabaseStorageUrl`
 * does not accept.
 *
 * Only properties named `image`, `logo`, `thumbnailUrl` or `contentUrl`
 * (case-insensitive) are treated as image references:
 * - a string value is deleted unless it is accepted;
 * - an array value keeps only accepted strings and objects whose `url` or
 *   `contentUrl` is accepted; the property is deleted when nothing is kept;
 * - an object value is deleted unless its `url` or `contentUrl` is accepted,
 *   otherwise it is kept and cleaned recursively.
 * Every other property is simply traversed.
 *
 * @param {*} value - Object or array to clean; anything else is ignored.
 * @returns {void}
 */
function stripExternalImageUrlsDeep(value) {
  if (value === null || value === undefined || typeof value !== "object") return;

  if (Array.isArray(value)) {
    value.forEach((entry) => stripExternalImageUrlsDeep(entry));
    return;
  }

  const imageKeys = new Set(["image", "logo", "thumbnailurl", "contenturl"]);
  // True when the node exposes at least one accepted URL.
  const hasStoredUrl = (node) =>
    [node.url, node.contentUrl].some((u) => typeof u === "string" && isSupabaseStorageUrl(u));

  Object.keys(value).forEach((key) => {
    const child = value[key];

    if (!imageKeys.has(String(key || "").toLowerCase())) {
      stripExternalImageUrlsDeep(child);
      return;
    }

    if (typeof child === "string") {
      if (!isSupabaseStorageUrl(child)) delete value[key];
      return;
    }

    if (Array.isArray(child)) {
      const kept = child.filter((item) => {
        if (typeof item === "string") return isSupabaseStorageUrl(item);
        return Boolean(item) && typeof item === "object" && hasStoredUrl(item);
      });
      if (kept.length) value[key] = kept;
      else delete value[key];
      return;
    }

    if (child && typeof child === "object" && !hasStoredUrl(child)) {
      delete value[key];
      return;
    }

    stripExternalImageUrlsDeep(child);
  });
}

/**
 * Remove phone-like content from a text via `stripPhonesFromFreeText`, then
 * normalize spacing with `normSpaces`.
 *
 * NOTE(review): the original note describes the remover as conservative
 * (only patterns that normalize as a BE phone number) — confirm against
 * `stripPhonesFromFreeText`, which is defined elsewhere.
 *
 * @param {*} s - Raw text; it is stringified. `null`/`undefined` give `null`.
 * @returns {string|null} The cleaned text, or `null` when nothing remains.
 */
function stripPhoneLikeFromSeoText(s) {
  if (s == null) return null;

  const scrubbed = stripPhonesFromFreeText(String(s));
  const compact = normSpaces(scrubbed);
  return compact ? compact : null;
}

/**
 * Build the public page URL of a company from its slug by appending it to
 * `PUBLIC_COMPANY_URL_BASE`.
 *
 * @param {string} slug - Company slug.
 * @returns {string|null} `${PUBLIC_COMPANY_URL_BASE}/${slug}`, or `null` for a falsy slug.
 */
function makeCompanyPublicUrl(slug) {
  return slug ? `${PUBLIC_COMPANY_URL_BASE}/${slug}` : null;
}

/**
 * Build a template-based SEO pack from the known company facts, without any
 * LLM call.
 *
 * Fields read from `facts`: name, industry, city, services, tags,
 * devis_gratuit, pricing_model, revelys_url, website, why_company,
 * cover_image_url, logo_url, address, postal_code, country.
 *
 * @param {object} facts - Company facts.
 * @returns {{ seo_title, seo_description, og_title, og_description, seo_jsonld }}
 *   Text fields normalized with `normalizeSeoTextNoTrunc`, plus a
 *   schema.org `LocalBusiness` object whose missing members are `undefined`.
 */
function buildFallbackSeoPack(facts) {
  const name = normSpaces(facts?.name);
  const industry = normSpaces(facts?.industry);
  const city = normSpaces(facts?.city);

  const seo_title = normalizeSeoTextNoTrunc(`${industry} à ${city} | ${name}`);

  // Description: first listed service (or the lowercased industry) plus an
  // optional pricing sentence.
  const serviceList = Array.isArray(facts?.services) ? facts.services.filter(Boolean) : [];
  const lead = serviceList[0] || industry.toLowerCase();
  let pricing = "";
  if (facts?.devis_gratuit === true) pricing = "Devis gratuit.";
  else if (facts?.pricing_model) pricing = "Tarification sur demande.";
  const seo_description = normalizeSeoTextNoTrunc(
    `Intervention à ${city} pour ${lead}. ${pricing} Disponible pour un premier contact rapide.`,
  );

  const og_title = normalizeSeoTextNoTrunc(seo_title);
  const og_description = normalizeSeoTextNoTrunc(seo_description);

  // Keywords: tags first, then services, blanks removed, capped at 20 entries.
  const tagList = Array.isArray(facts?.tags) ? facts.tags : [];
  const rawServices = Array.isArray(facts?.services) ? facts.services : [];
  const keywords = [...tagList, ...rawServices]
    .map((entry) => normSpaces(entry))
    .filter(Boolean)
    .slice(0, 20);

  const hasAddressData = Boolean(facts?.address || facts?.postal_code || facts?.city);
  const address = hasAddressData
    ? {
        "@type": "PostalAddress",
        streetAddress: facts?.address || undefined,
        postalCode: facts?.postal_code || undefined,
        addressLocality: facts?.city || undefined,
        addressCountry: facts?.country || undefined,
      }
    : undefined;

  const seo_jsonld = {
    "@context": "https://schema.org",
    "@type": "LocalBusiness",
    name: name || undefined,
    url: facts?.revelys_url || undefined,
    sameAs: facts?.website ? [facts.website] : undefined,
    description: clampLine(facts?.why_company, 180) || undefined,
    image: facts?.cover_image_url || undefined,
    logo: facts?.logo_url || undefined,
    keywords: keywords.length ? keywords.join(", ") : undefined,
    address,
    areaServed: facts?.city ? [facts.city] : undefined,
  };

  return { seo_title, seo_description, og_title, og_description, seo_jsonld };
}

/**
 * Ask the LLM (through `ollamaJson`) for an SEO pack describing the company,
 * then sanitize the answer.
 *
 * Post-processing applied to the answer:
 * - each text field is normalized with `normalizeSeoTextNoTrunc`; an empty
 *   field falls back to the matching `buildFallbackSeoPack` value (the og_*
 *   fields first fall back to their seo_* counterpart);
 * - `seo_jsonld` is reset to `null` when it is not a plain object, receives a
 *   default "@context", is stripped of phone keys/values, and — when
 *   `UPLOAD_IMAGES` is on — goes through `stripExternalImageUrlsDeep`;
 * - `sameAs` becomes a de-duplicated array of `normalizePublicUrl` results,
 *   or is removed when none is usable.
 *
 * @param {object} facts - Factual company data, serialized as JSON into the prompt.
 * @returns {Promise<object|null>} The sanitized pack, or `null` when the LLM
 *   call yields nothing.
 */
async function enrichSeoWithLLM(facts) {
  const nullable = (kind) => ({ type: [kind, "null"] });
  const schema = {
    type: "object",
    properties: {
      seo_title: nullable("string"),
      seo_description: nullable("string"),
      og_title: nullable("string"),
      og_description: nullable("string"),
      seo_jsonld: nullable("object"),
    },
    required: ["seo_title", "seo_description", "og_title", "og_description", "seo_jsonld"],
  };

  const factsJson = JSON.stringify(facts);

  const prompt = `
Tu generes un pack SEO + GEO pour une fiche entreprise NON revendiquee.
IMPORTANT: tu dois etre factuel et ne rien inventer.

ENTREE (JSON factuel):
${factsJson}

SORTIE: reponds STRICTEMENT en JSON (aucun texte hors JSON) avec les champs:
- seo_title (30-60 caracteres): format "[Activite] a [Ville] | [Nom entreprise]". Doit contenir la ville et le nom.
- seo_description (140-170 caracteres): interdit d'inclure le nom de l'entreprise. 1 benefice concret + 1 preuve/differenciant + contexte local.
- og_title (<=70) et og_description (<=200): plus humain, sans superlatifs.
- seo_jsonld: objet JSON-LD schema.org (ou null). Doit contenir:
  - "@context": "https://schema.org"
  - "@type": le plus specifique possible si certain, sinon "LocalBusiness"
  - name, description (courte), url (URL Revelys de la fiche), sameAs (site officiel si dispo)
  - address (PostalAddress) si adresse/CP/ville dispo
  - areaServed (ville/zone) si dispo
  - keywords (tags + services) si dispo
  - image/logo si dispo

INTERDITS (dans seo_title/seo_description/og_title/og_description):
- "revelys", "verified", "score", "premium", "meilleur", "#1", "RAS"
- aucun "@"
- aucun numero de telephone

ZERO HALLUCINATION:
- si une info n'est pas dans l'entree, mets null (ou omets dans seo_jsonld).
`.trim();

  const out = await ollamaJson(schema, prompt);
  if (!out) return null;

  // Text fields: the first candidate that survives normalization wins; when
  // none does, the (empty) result for the last candidate is kept.
  const fallback = buildFallbackSeoPack(facts);
  const firstUsable = (...candidates) => {
    let normalized;
    for (const candidate of candidates) {
      normalized = normalizeSeoTextNoTrunc(candidate);
      if (normalized) break;
    }
    return normalized;
  };

  out.seo_title = firstUsable(out.seo_title, fallback.seo_title);
  out.seo_description = firstUsable(out.seo_description, fallback.seo_description);
  out.og_title = firstUsable(out.og_title, out.seo_title, fallback.og_title);
  out.og_description = firstUsable(out.og_description, out.seo_description, fallback.og_description);

  // JSON-LD must be a plain object; any other truthy value is discarded.
  const jsonld = out.seo_jsonld;
  const isPlainObject = Boolean(jsonld) && typeof jsonld === "object" && !Array.isArray(jsonld);
  if (jsonld && !isPlainObject) out.seo_jsonld = null;
  if (!isPlainObject) return out;

  if (!jsonld["@context"]) jsonld["@context"] = "https://schema.org";

  // Public SEO payloads must not expose phone numbers.
  stripPhoneKeysDeep(jsonld);
  stripPhoneValuesDeep(jsonld);
  if (UPLOAD_IMAGES) stripExternalImageUrlsDeep(jsonld);

  // sameAs is always stored as a de-duplicated array of normalized URLs.
  const rawSameAs = jsonld.sameAs;
  let sameAsCandidates = [];
  if (Array.isArray(rawSameAs)) sameAsCandidates = rawSameAs;
  else if (hasScalarValue(rawSameAs)) sameAsCandidates = [rawSameAs];

  const sameAsUrls = sameAsCandidates.map((entry) => normalizePublicUrl(entry)).filter(Boolean);
  if (sameAsUrls.length) jsonld.sameAs = [...new Set(sameAsUrls)];
  else delete jsonld.sameAs;

  return out;
}

// -------------------- SQL BUILD --------------------
/**
 * Render the upsert statement for one company row.
 *
 * The statement is `INSERT INTO companies (...) SELECT <values>`, followed by
 * `ON CONFLICT (slug) DO UPDATE` that overwrites every column except `slug`
 * and `created_at`, and only when the existing row is not claimed. When the
 * row carries a BCE number (per `hasScalarValue`), the insert is additionally
 * guarded by a `WHERE NOT EXISTS` clause on another slug holding the same
 * number.
 *
 * Every value is rendered through `sqlValue` (defined elsewhere in the file).
 *
 * @param {object} row - Company record; each column is read from the property
 *   of the same name.
 * @returns {string} The SQL statement, terminated by a semicolon.
 */
function buildCompanyInsertSQL(row) {
  // Column order is significant: values are emitted in exactly this order.
  const cols = [
    "slug", "market", "name", "industry", "tags", "services",
    "pricing_model", "budget_level", "price_indication", "devis_gratuit",
    "city", "postal_code", "country", "address",
    "website", "facebook", "instagram", "linkedin", "tiktok",
    "contact_email", "contact_name", "contact_phone", "public_phone",
    "founder_name", "founder_role", "founder_photo_url",
    "ideal_zone", "languages", "availability", "opening_hours",
    "google_rating", "google_reviews_count", "google_reviews",
    "company_faq", "description",
    "seo_title", "seo_description", "og_title", "og_description", "seo_jsonld",
    "seo_generated_at", "seo_ai_used", "seo_version", "seo_last_inputs_hash",
    "why_company", "why_entrepreneur", "emotional_need_label", "rituals", "ras_score",
    "logo_url", "cover_image_url", "gallery_urls", "video_urls", "video_embed_url",
    "tier", "verification_status", "verification_checks", "content_status",
    "is_claimed", "opt_out_status",
    "bce_number", "bce_status", "bce_source", "bce_verified_at", "bce_last_checked_at",
    "bce_legal_name", "founded_on", "bce_type_of_enterprise", "bce_juridical_form",
    "bce_juridical_situation", "bce_source_update_date",
    "created_at",
  ];

  // Each column maps to the row property of the same name.
  const vals = cols.map((col) => row[col]).map(sqlValue);

  const setClause = cols
    .filter((col) => col !== "slug" && col !== "created_at")
    .map((col) => `${col} = EXCLUDED.${col}`)
    .join(", ");
  const insertSelect = vals.join(", ");

  // Skip the insert when another slug already owns this BCE number.
  const bceDedupClause = hasScalarValue(row.bce_number)
    ? [
        "",
        "WHERE NOT EXISTS (",
        "  SELECT 1",
        "  FROM companies c_existing",
        `  WHERE c_existing.bce_number = ${sqlValue(row.bce_number)}`,
        `    AND c_existing.slug <> ${sqlValue(row.slug)}`,
        ")",
      ].join("\n")
    : "";

  return [
    `INSERT INTO companies (${cols.join(", ")})`,
    `SELECT ${insertSelect}${bceDedupClause}`,
    `ON CONFLICT (slug) DO UPDATE SET ${setClause}`,
    "WHERE companies.is_claimed IS NOT TRUE;",
  ].join("\n");
}

// -------------------- MAIN --------------------
async function main() {
  if ((UPLOAD_IMAGES || VERIFY_BCE_WITH_KBO) && !supabase) {
    console.error("SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY missing (required for image upload or KBO verification).");
    process.exit(1);
  }
  if (SEARCH_PROVIDER === "serper" && !SERPER_API_KEY) {
    console.error("SERPER_API_KEY missing (required when SEARCH_PROVIDER=serper).");
    process.exit(1);
  }
  if (SEARCH_PROVIDER === "serper" && !BRAVE_API_KEY) {
    console.warn("SEARCH_PROVIDER=serper but BRAVE_API_KEY missing: no Brave fallback will be available if Serper is down.");
  }
  if (SEARCH_PROVIDER === "brave" && !BRAVE_API_KEY) {
    console.error("BRAVE_API_KEY missing (required when SEARCH_PROVIDER=brave).");
    process.exit(1);
  }
  if (!CITIES.length || !INDUSTRIES.length) {
    console.error("Missing configs: config/cities.json and config/industries.json");
    process.exit(1);
  }

  ensureDir(path.join(__dirname, OUT_DIR_NAME));
  const outDir = path.join(__dirname, OUT_DIR_NAME);
  const outSql = path.join(outDir, "seed_companies.sql");
  const outLog = path.join(outDir, "run_log.ndjson");
  const outStats = path.join(outDir, "stats.json");

  logInfo(
    `[Config] out=${outDir} | provider=${SEARCH_PROVIDER}${
      SEARCH_PROVIDER === "serper" ? ` (fallback=${BRAVE_API_KEY ? "brave" : "none"})` : ""
    } | resume=${RESUME ? "on" : "off"} | strict=${STRICT_CITY_MATCH ? `on (${LOCATION_MATCH_MODE})` : "off"} | images=${
      UPLOAD_IMAGES ? "upload" : "external"
    } | seo=${GENERATE_SEO ? "on" : "off"} | priority=${PRIORITY_FROM_LOG ? "log" : "off"}`
  );

  let localBceDataset = null;
  if (USE_LOCAL_BCE_DATA) {
    localBceDataset = await loadLocalBceDataset();
    if (!localBceDataset) {
      console.warn(`[BCE] local dataset not found or incomplete in ${BCE_DIR}. Continuing without CSV cross-check.`);
    }
  }

  const pairCounts = new Map(); // `${industry}||${city}` => count
  const usedBce = new Set();
  const usedSlugs = new Set();
  const usedDomainsGlobal = new Set();
  const usedDomainsByPair = new Map(); // key => Set(domains)
  let priorityCandidatesByPair = new Map();
  let priorityCandidatesByNormPair = new Map();

  if (PRIORITY_FROM_LOG) {
    let totalOk = 0;
    let kept = 0;
    let skippedNonOfficial = 0;
    const loadedPaths = [];

    for (const priorityLogPath of PRIORITY_LOG_PATHS) {
      const priority = await loadPriorityCandidatesFromLog(priorityLogPath);
      if (!priority.loaded) continue;
      loadedPaths.push(priority.path);
      totalOk += Number(priority.totalOk || 0);
      kept += Number(priority.kept || 0);
      skippedNonOfficial += Number(priority.skippedNonOfficial || 0);
      mergePriorityCandidatesMap(priorityCandidatesByPair, priority.byPair);
      mergePriorityCandidatesMap(priorityCandidatesByNormPair, priority.byNormPair);
    }

    if (loadedPaths.length) {
      logInfo(
        `[Priority] loaded ${kept}/${totalOk} candidate(s) from ${loadedPaths.join(", ")} | pairs=${priorityCandidatesByPair.size} | filtered_non_official=${skippedNonOfficial}`
      );
    } else {
      logInfo(`[Priority] no log file at ${PRIORITY_LOG_PATHS.join(", ")}; search-only mode.`);
    }
  }

  const resumeMode = RESUME && (fileExists(outLog) || fileExists(outSql) || fileExists(outStats));

  if (!resumeMode) {
    if (!DRY_RUN) {
      fs.writeFileSync(outSql, `-- Seed Revelys (market=${MARKET})\n\n`, "utf-8");
    }
    fs.writeFileSync(outLog, "", "utf-8");
    fs.writeFileSync(outStats, "{}", "utf-8");
  } else {
    if (!fileExists(outLog)) fs.writeFileSync(outLog, "", "utf-8");
    if (!fileExists(outStats)) fs.writeFileSync(outStats, "{}", "utf-8");
    if (!DRY_RUN && !fileExists(outSql)) {
      fs.writeFileSync(outSql, `-- Seed Revelys (market=${MARKET})\n\n`, "utf-8");
    }

    const { okCount } = await loadResumeFromLog(outLog, { usedBce, usedDomainsGlobal, usedDomainsByPair, pairCounts });
    if (okCount) console.log(`Resume: ${okCount} companies already processed (status=ok).`);

    // Refresh stats file based on resumed counts (useful for quick monitoring).
    const statsObj = {};
    for (const [k, v] of pairCounts.entries()) statsObj[k] = v;
    fs.writeFileSync(outStats, JSON.stringify(statsObj, null, 2), "utf-8");
  }

  if (!DRY_RUN && fileExists(outSql)) {
    const loadedSlugs = loadUsedSlugsFromSql(outSql, usedSlugs);
    if (loadedSlugs) logDebug(`[Resume] loaded ${loadedSlugs} existing slug(s) from seed SQL`);
    const loadedBce = loadUsedBceFromSql(outSql, usedBce);
    if (loadedBce) logDebug(`[Resume] loaded ${loadedBce} existing BCE number(s) from seed SQL`);
  }

  // Current processing is sequential by design (quality guards + deterministic logging).
  // Keep p-limit wrapper for a future opt-in parallel mode.
  const limitSite = pLimit(1);

  function log(obj) {
    fs.appendFileSync(outLog, JSON.stringify({ ts: new Date().toISOString(), ...obj }) + "\n", "utf-8");
  }

  for (const city of CITIES) {
    for (const industry of INDUSTRIES) {
      const key = `${industry}||${city}`;
      pairCounts.set(key, pairCounts.get(key) || 0);

      if (pairCounts.get(key) >= TARGET_PER_PAIR) continue;

      console.log(`\n=== ${industry} | ${city} (target ${TARGET_PER_PAIR}) ===`);

      const queryPlans = buildSearchQueries(industry, city);
      logInfo(`[Search] ${queryPlans.length} query(ies): ${queryPlans.map((x) => x.query).join(" | ")}`);

      const usedSet =
        USED_DOMAINS_SCOPE === "pair"
          ? usedDomainsByPair.get(key) || (usedDomainsByPair.set(key, new Set()), usedDomainsByPair.get(key))
          : usedDomainsGlobal;

      const searchPageSize =
        SEARCH_PROVIDER === "serper" ? SERPER_COUNT : SEARCH_PROVIDER === "brave" ? BRAVE_COUNT : 10;
      const planStates = queryPlans.map((plan) => ({
        ...plan,
        page: 1,
        offset: 0,
        fetches: 0,
        exhausted: false,
      }));
      const maxSearchPagesPerPlan = Math.max(1, Math.ceil(MAX_SITES_TO_TRY / Math.max(1, planStates.length)) + 2);
      const seenCandidateDomains = new Set();
      const candidateQueue = [];
      let nextPlanCursor = 0;

      const enqueueCandidate = (c) => {
        if (!c?.url) return false;
        const reason = nonOfficialSiteReason({
          url: c.url,
          title: c.title || "",
          snippet: c.snippet || "",
          targetCity: city,
        });
        if (reason) {
          logDebug(`  [SkipCandidate] non-official (${reason}) ${c.url}`);
          return false;
        }
        const d = domainOf(c.url);
        if (!d || usedSet.has(d) || seenCandidateDomains.has(d)) return false;
        seenCandidateDomains.add(d);
        candidateQueue.push(c);
        return true;
      };

      const pushSearchBatch = (rows, state) => {
        let added = 0;
        for (const item of rows || []) {
          const c = {
            ...item,
            searchKeyword: state.searchKeyword,
            categoryIndustry: state.categoryIndustry,
            sourceQuery: state.query,
          };
          if (enqueueCandidate(c)) added += 1;
        }
        return added;
      };

      const fetchNextCandidatesForPlan = async (state) => {
        if (!state || state.exhausted) return 0;
        if (state.fetches >= maxSearchPagesPerPlan) {
          state.exhausted = true;
          return 0;
        }
        const pageForQuery = state.page;
        const offsetForQuery = state.offset;
        state.fetches += 1;
        logInfo(
          `[Search] ${state.query} ${SEARCH_PROVIDER === "brave" ? `(offset=${offsetForQuery})` : `(page=${pageForQuery})`}`
        );

        let rows = [];
        try {
          rows = await searchWeb(state.query, {
            count: searchPageSize,
            page: pageForQuery,
            offset: offsetForQuery,
          });
        } catch (err) {
          state.exhausted = true;
          console.warn(`   Search failed for query "${state.query}" (${String(err?.message || err)}).`);
          return 0;
        }

        if (SEARCH_PROVIDER === "serper") state.page += 1;
        else if (SEARCH_PROVIDER === "brave") state.offset += searchPageSize;
        else state.exhausted = true; // DDG is best-effort first-page only.

        logDebug(`[Search] -> ${rows.length} result(s)`);

        if (!rows.length) {
          state.exhausted = true;
          return 0;
        }

        return pushSearchBatch(rows, state);
      };

      // Seed the candidate queue with sites remembered from the historical log for this
      // (industry, city) pair. The exact pair key is tried first, then the normalized pair key.
      const priorityForPair = !PRIORITY_FROM_LOG
        ? []
        : priorityCandidatesByPair.get(key) ||
          priorityCandidatesByNormPair.get(priorityPairKey(industry, city) || "__none__") ||
          [];
      let priorityQueued = 0;
      for (const entry of priorityForPair) {
        // Resolved lazily: the label helpers only run when the log entry lacks its own value.
        const industryLabel = () => canonicalIndustryLabel(industry) || cleanIndustryKeyword(industry) || industry;
        const accepted = enqueueCandidate({
          title: entry.title || domainOf(entry.url),
          url: entry.url,
          snippet: entry.snippet || "",
          searchKeyword: entry.searchKeyword || industryLabel(),
          categoryIndustry: entry.categoryIndustry || industryLabel(),
          sourceQuery: entry.sourceQuery || "priority_log",
        });
        if (accepted) priorityQueued += 1;
      }
      if (priorityQueued > 0) {
        logInfo(`[Priority] queued ${priorityQueued} site(s) from historical log for ${industry} | ${city}`);
      }

      logInfo(`[Candidates] queue=${candidateQueue.length} priority=${priorityQueued} (max crawls=${MAX_SITES_TO_TRY})`);

      let crawledForPair = 0;
      while (pairCounts.get(key) < TARGET_PER_PAIR && crawledForPair < MAX_SITES_TO_TRY) {
        if (!candidateQueue.length) {
          let refilled = false;
          for (let step = 0; step < planStates.length; step++) {
            const idx = (nextPlanCursor + step) % Math.max(1, planStates.length);
            const state = planStates[idx];
            if (!state || state.exhausted) continue;
            const added = await fetchNextCandidatesForPlan(state);
            nextPlanCursor = (idx + 1) % Math.max(1, planStates.length);
            if (added > 0) {
              refilled = true;
              break;
            }
          }
          if (!refilled) break;
        }

        // Take the next candidate; anything without a resolvable domain is dropped silently.
        const cand = candidateQueue.shift();
        if (!cand) continue;

        const dom = domainOf(cand.url);
        if (!dom) continue;

        // Domains already present in usedSet are never crawled again.
        if (usedSet.has(dom)) {
          logDebug(`  [Skip] domain already processed: ${dom}`);
          continue;
        }
        // Only candidates that reach the crawl step consume the MAX_SITES_TO_TRY budget;
        // the early skips above do not.
        crawledForPair += 1;

        // Base record spread into every log() entry emitted for this candidate.
        const startedAt = Date.now();
        const result = {
          industry,
          industry_category: cand.categoryIndustry || canonicalIndustryLabel(industry) || cleanIndustryKeyword(industry) || industry,
          search_keyword: cand.searchKeyword || null,
          city,
          url: cand.url,
          domain: dom,
          status: "processing",
        };

        // The crawl goes through limitSite (presumably a concurrency limiter; confirm at its definition).
        const siteResult = await limitSite(async () => {
          console.log(`\n[Crawl] ${cand.title} (${cand.url})`);
          return await crawlSite(cand.url, { cityHint: city });
        });

        // A falsy crawl result is treated as "no HTML" and the candidate is skipped.
        if (!siteResult) {
          logInfo("  [Skip] crawl failed (no HTML)");
          log({ ...result, status: "skip", reason: "crawl_failed", ms: Date.now() - startedAt });
          continue;
        }

        // Record up to 5 normalized, de-duplicated emails plus the phone on `result`, so they
        // appear in every later log entry for this candidate (including skips).
        const extractedEmails = Array.from(new Set((siteResult.emails || []).map((e) => normalizeEmail(e)).filter(Boolean))).slice(0, 5);
        if (extractedEmails.length) result.extracted_emails = extractedEmails;
        if (siteResult.public_phone) result.extracted_phone = siteResult.public_phone;
        if (siteResult.public_phone_source) result.extracted_phone_source = siteResult.public_phone_source;

        // A non-empty reason from nonOfficialSiteReason() causes the site to be skipped;
        // the log reason is fixed and the specific reason goes in `nonOfficialReason`.
        const nonOfficialReason = nonOfficialSiteReason({
          url: siteResult.base || cand.url,
          title: cand.title || "",
          snippet: cand.snippet || "",
          siteName: siteResult.name || "",
          corpus: siteResult.corpus || "",
          targetCity: city,
        });
        if (nonOfficialReason) {
          logInfo(`  [Skip] non-official site (${nonOfficialReason})`);
          log({ ...result, status: "skip", reason: "non_official_directory_or_platform", nonOfficialReason, ms: Date.now() - startedAt });
          continue;
        }

        // One-line summary of what the crawl extracted.
        logInfo(
          `  [Extract] pages=${siteResult.pagesVisited} | city=${siteResult.city || "?"} | pc=${siteResult.postal_code || "?"} | phone=${
            siteResult.public_phone ? "yes" : "no"
          } | emails=${(siteResult.emails || []).length} | bce=${(siteResult.bces || []).length} | images=${
            (siteResult.imageCandidates || []).length
          }`
        );

        // BCE
        const rawSiteName = siteResult.name ? siteResult.name.split("|")[0].trim() : null;
        let bceCandidates = siteResult.bces.filter(isValidBceModulo97);
        let bceRecoveredByIdentity = null;
        const identityNameForRecovery = normalizeCompanyNameCandidate(rawSiteName || "");
        const identityNameIsUsableForRecovery =
          Boolean(identityNameForRecovery) &&
          !looksLikeNoiseCompanyName(identityNameForRecovery) &&
          !looksLikeSeoCatchallCompanyName(identityNameForRecovery);
        if (!bceCandidates.length && localBceDataset && !identityNameIsUsableForRecovery) {
          logDebug(`  [BCE] local identity recovery skipped (name unusable: ${identityNameForRecovery || "none"})`);
        }
        if (!bceCandidates.length && localBceDataset && identityNameIsUsableForRecovery) {
          const rec = recoverBceFromLocalIdentity({
            dataset: localBceDataset,
            companyName: identityNameForRecovery,
            address: siteResult.address || null,
            postal_code: siteResult.postal_code || null,
            city: siteResult.city || null,
          });
          if (rec?.bce10) {
            bceCandidates = [rec.bce10];
            bceRecoveredByIdentity = rec;
            logInfo(`  [BCE] recovered from local dataset: ${rec.bce10} (score=${rec.score})`);
          }
        }
        if (!bceCandidates.length) {
          logInfo("  [Skip] no valid BCE (mod97) and no local identity match");
          log({
            ...result,
            status: "skip",
            reason: "no_valid_bce_and_no_identity_match",
            ms: Date.now() - startedAt,
            pages: siteResult.pagesVisited,
          });
          continue;
        }
        const bce = bceCandidates[0];
        if (usedBce.has(bce)) {
          logInfo(`  [Skip] BCE already used: ${bce}`);
          log({ ...result, status: "skip", reason: "skip_duplicate_bce", bce, ms: Date.now() - startedAt });
          continue;
        }

        const localBce = enrichFromLocalBce(bce, localBceDataset);
        if (localBceDataset && STRICT_BCE_LOCAL_MATCH && !localBce) {
          logInfo("  [Skip] BCE not found in local BCE dataset");
          log({ ...result, status: "skip", reason: "bce_not_found_local_dataset", bce, ms: Date.now() - startedAt });
          continue;
        }

        let bceStatus = localBce?.bce_status || null;
        let bceSource = localBce?.bce_source || "SCRAPE_SITE";
        let bceVerifiedAt = localBce ? new Date().toISOString() : null;
        const bceLegalName = localBce?.bce_legal_name || null;
        const foundedOn = localBce?.founded_on || null;
        const bceTypeOfEnterprise = localBce?.bce_type_of_enterprise || null;
        const bceJuridicalForm = localBce?.bce_juridical_form || null;
        const bceJuridicalSituation = localBce?.bce_juridical_situation || null;
        const bceSourceUpdateDate = localBce?.bce_source_update_date || null;

        // Verify KBO (optional hard-check)
        if (VERIFY_BCE_WITH_KBO) {
          logInfo("  [KBO] verifying BCE...");
          const v = await verifyBceInKbo(bce);
          if (v.ok !== true) {
            logInfo("  [Skip] BCE not confirmed in KBO");
            log({ ...result, status: "skip", reason: "bce_not_confirmed_kbo", bce, ms: Date.now() - startedAt });
            continue;
          }
          if (!bceStatus && v.status) {
            const kboNorm = normalizeBceForTokenMatch(v.status);
            bceStatus = kboNorm === "ac" || kboNorm === "actif" || kboNorm === "active" || kboNorm === "actief" ? "Actif" : normSpaces(v.status);
          }
          bceSource = localBce ? `${localBce.bce_source}+KBO` : "BCE_OPEN_DATA";
          bceVerifiedAt = new Date().toISOString();
        }

        if (!bceStatus) bceStatus = "Actif";

        if (!bceStatus || !bceStatusIsPublishable(bceStatus)) {
          logInfo(`  [Skip] BCE status not publishable (${bceStatus || "unknown"})`);
          log({ ...result, status: "skip", reason: "bce_status_not_publishable", bce, bceStatus, ms: Date.now() - startedAt });
          continue;
        }

        // Identity (available early for fallback email/images)
        // pickCompanyName chooses among the site name, the BCE legal name and a fallback title.
        const rawName = rawSiteName;
        const name = pickCompanyName({
          siteName: rawName,
          bceLegalName,
          fallbackTitle: localBce?.bce_legal_name || cand.title || null,
        });
        // Category used for classification vs. keyword used for the SEO slug.
        const industryCategory = cand.categoryIndustry || canonicalIndustryLabel(industry) || cleanIndustryKeyword(industry) || industry;
        const seoIndustryKeyword = cand.searchKeyword || industryCategory;
        const identityNameForValidation = normalizeCompanyNameCandidate(rawName || name || "");
        // Reject only when ALL of these hold: a local BCE record exists, the number was scraped
        // (not recovered by identity), the site name looks like an SEO catch-all, and that name
        // shares no identity with the BCE legal name.
        if (
          localBce &&
          !bceRecoveredByIdentity &&
          bceLegalName &&
          identityNameForValidation &&
          looksLikeSeoCatchallCompanyName(identityNameForValidation) &&
          !sharesCompanyIdentity(identityNameForValidation, bceLegalName)
        ) {
          logInfo("  [Skip] BCE/legal name mismatch with SEO-style site identity");
          log({
            ...result,
            status: "skip",
            reason: "bce_identity_mismatch",
            bce,
            siteName: identityNameForValidation,
            bceLegalName,
            ms: Date.now() - startedAt,
          });
          continue;
        }

        // Real location (do not overwrite the truth)
        // The city extracted from the site is kept only if it survives sanitizing and is not
        // flagged as a noisy label.
        const extractedCityRaw = sanitizeCityValue(siteResult.city || null);
        const extractedCity = extractedCityRaw && !isLikelyNoisyCityLabel(extractedCityRaw) ? extractedCityRaw : null;
        if (extractedCityRaw && !extractedCity) {
          logDebug(`  [Geo] ignored noisy extracted city label: ${extractedCityRaw}`);
        }
        // "Real" location = any city, postal code or address from either the site or the local BCE record.
        const hasRealLocation =
          Boolean(extractedCity) ||
          Boolean(siteResult.postal_code) ||
          Boolean(siteResult.address) ||
          Boolean(localBce?.city) ||
          Boolean(localBce?.postal_code) ||
          Boolean(localBce?.address);

        // Strict matching: compare the known location with the target city at the granularity
        // selected by LOCATION_MATCH_MODE (off/none/0, province, region, or exact city by default).
        if (STRICT_CITY_MATCH) {
          const mode = String(LOCATION_MATCH_MODE || "city").toLowerCase();
          // Site-extracted values take precedence over the local BCE record.
          const strictCity = extractedCity || sanitizeCityValue(localBce?.city || null);
          const strictPostalCode = siteResult.postal_code || localBce?.postal_code || null;
          // Province is derived from the postal code when present, else from the city.
          const strictProvince = strictPostalCode ? beProvinceFromPostalCode(strictPostalCode) : beProvinceFromCityHint(strictCity || null);

          // In strict mode, do not guess the location if we have nothing reliable.
          // This check applies to every mode, including "off".
          if (!hasRealLocation) {
            logInfo("  [Skip] missing reliable location (strict)");
            log({
              ...result,
              status: "skip",
              reason: "geo_unknown_strict",
              bce,
              targetCity: city,
              ms: Date.now() - startedAt,
            });
            continue;
          }

          if (mode === "off" || mode === "none" || mode === "0") {
            // no-op
          } else if (mode === "province") {
            const targetProvince = beProvinceFromCityHint(city);
            const extractedProvince = strictProvince;

            // If we cannot determine the province reliably, fall back to exact city match.
            // With no known city either, the candidate is accepted.
            if (!targetProvince || !extractedProvince) {
              if (strictCity && !sameCity(strictCity, city)) {
                logInfo("  [Skip] province unknown; city mismatch (strict)");
                log({
                  ...result,
                  status: "skip",
                  reason: "province_unknown_strict",
                  bce,
                  targetCity: city,
                  extractedCity: strictCity,
                  postal_code: strictPostalCode,
                  ms: Date.now() - startedAt,
                });
                continue;
              }
            // Provinces are compared with sameCity(), not strict equality.
            } else if (!sameCity(extractedProvince, targetProvince)) {
              logInfo(`  [Skip] province mismatch (${extractedProvince} != ${targetProvince})`);
              log({
                ...result,
                status: "skip",
                reason: "province_mismatch",
                bce,
                targetCity: city,
                targetProvince,
                extractedCity: strictCity,
                extractedProvince,
                postal_code: strictPostalCode,
                ms: Date.now() - startedAt,
              });
              continue;
            }
          } else if (mode === "region") {
            // Target region is derived from the target city's province; the extracted region
            // prefers the postal code and falls back to the province.
            const targetProvince = beProvinceFromCityHint(city);
            const targetRegion = targetProvince ? beRegionFromProvince(targetProvince) : null;
            const extractedRegion = strictPostalCode
              ? beRegionFromPostalCode(strictPostalCode)
              : strictProvince
                ? beRegionFromProvince(strictProvince)
                : null;

            if (!targetRegion || !extractedRegion) {
              // Best-effort: if missing, accept only when exact city matches.
              if (strictCity && !sameCity(strictCity, city)) {
                logInfo("  [Skip] region unknown; city mismatch (strict)");
                log({
                  ...result,
                  status: "skip",
                  reason: "region_unknown_strict",
                  bce,
                  targetCity: city,
                  extractedCity: strictCity,
                  postal_code: strictPostalCode,
                  ms: Date.now() - startedAt,
                });
                continue;
              }
            // Regions are compared with strict equality.
            } else if (extractedRegion !== targetRegion) {
              logInfo(`  [Skip] region mismatch (${extractedRegion} != ${targetRegion})`);
              log({
                ...result,
                status: "skip",
                reason: "region_mismatch",
                bce,
                targetCity: city,
                targetRegion,
                extractedCity: strictCity,
                extractedRegion,
                postal_code: strictPostalCode,
                ms: Date.now() - startedAt,
              });
              continue;
            }
          } else {
            // Default: exact commune match.
            // A candidate with no known city passes (it only had a postal code or address).
            if (strictCity && !sameCity(strictCity, city)) {
              logInfo(`  [Skip] city mismatch (${strictCity} != ${city})`);
              log({
                ...result,
                status: "skip",
                reason: "city_mismatch",
                bce,
                targetCity: city,
                extractedCity: strictCity,
                postal_code: strictPostalCode,
                ms: Date.now() - startedAt,
              });
              continue;
            }
          }
        }

        // Final city preference: local BCE record, then the site-extracted city, then the target city.
        let finalCity = sanitizeCityValue(localBce?.city || extractedCity || city, { fallback: city });
        const cityHintClean = sanitizeCityValue(city, { fallback: localBce?.city || null });
        // When both names denote the same city but the final one is more than one character
        // shorter once normalized, use the target-city spelling instead.
        if (finalCity && cityHintClean && sameCity(finalCity, cityHintClean)) {
          const finalNorm = normalizeCityName(finalCity);
          const hintNorm = normalizeCityName(cityHintClean);
          if (finalNorm && hintNorm && finalNorm.length + 1 < hintNorm.length) {
            finalCity = cityHintClean;
          }
        }
        logDebug(`  [Geo] final city = ${finalCity}`);

        if (!name || !finalCity) {
          logInfo("  [Skip] invalid identity/location (name or city missing)");
          log({ ...result, status: "skip", reason: "identity_or_city_missing", bce, name, finalCity, ms: Date.now() - startedAt });
          continue;
        }
        // The slug is built from the TARGET city (falling back to the final city), the SEO
        // keyword and the company name.
        const slugCity = sanitizeCityValue(city, { fallback: finalCity }) || finalCity;
        const slugBase = makeSlug({
          industryKeyword: seoIndustryKeyword,
          city: slugCity,
          name,
        });
        // NOTE(review): the slug is reserved in usedSlugs here, before the email/image/LLM checks
        // that can still skip this candidate. Confirm whether a skipped candidate should release it.
        const slug = reserveUniqueSlug(slugBase, usedSlugs);
        // Slugs must be lowercase alphanumeric groups separated by single hyphens.
        if (!/^[a-z0-9]+(?:-[a-z0-9]+)*$/.test(slug)) {
          logInfo("  [Skip] invalid slug format");
          log({ ...result, status: "skip", reason: "invalid_slug_format", bce, slug, ms: Date.now() - startedAt });
          continue;
        }

        // Keep address in a "street only" shape when possible (postal_code/city are separate columns).
        // The local BCE record is preferred over the site for both postal code and address.
        const finalPostalCode = localBce?.postal_code || siteResult.postal_code || null;
        const cleanedSiteAddress = sanitizeAddressText(siteResult.address, { companyName: name });
        const addressCandidate = sanitizeAddressText(localBce?.address, { companyName: name }) || cleanedSiteAddress;
        const addressStreet = stripPostalCityFromAddress(addressCandidate, finalPostalCode, finalCity);
        // Website preference: crawl base URL, then the local BCE website, then the candidate URL.
        const website = normalizePublicUrl(siteResult.base || localBce?.website || cand.url);

        // Email
        // Resolution order: site emails, then snippet recovery (if enabled), then the BCE contact email.
        let email = pickBestEmail(siteResult.emails, website);
        if (email) logDebug("  [Email] found on site");

        // Fallback email (snippets)
        // Note: this passes the target `city`, not `finalCity`.
        if (!email && RECOVER_EMAIL) {
          const recovered = await tryRecoverEmailFromSnippets(name, city, website);
          if (recovered) {
            email = recovered;
            console.log(`  Recovered email from snippets: ${email}`);
          }
        }

        if (!email && localBce?.contact_email) {
          email = localBce.contact_email;
          logDebug("  [Email] fallback from BCE contact.csv");
        }

        // An email is mandatory: without one the candidate is skipped.
        if (!email) {
          logInfo("  [Skip] no email (even after recovery)");
          log({ ...result, status: "skip", reason: "no_email_even_after_recovery", bce, ms: Date.now() - startedAt });
          continue;
        }

        // Phone: the site value wins over the BCE contact value; the source label follows
        // whichever one supplied the number.
        const publicPhone = siteResult.public_phone || localBce?.public_phone || null;
        const contactPhone = publicPhone || null;
        const publicPhoneSource = siteResult.public_phone
          ? siteResult.public_phone_source
          : localBce?.public_phone
            ? "bce_contact"
            : null;

        // Images
        // Vision pre-scoring: runs before Mistral (models are sequential, no VRAM conflict).
        // Scores the top VISION_TOP_N heuristic candidates before final selection.
        // Candidates are filtered (bad or logo-like URLs dropped), ranked by bonus + semantic
        // descending, truncated to VISION_TOP_N, then de-duplicated by URL.
        const visionCandidateUrls = VISION_ENABLED
          ? [...new Set(
              (siteResult.imageCandidates || [])
                .filter((x) => x?.url && !isBadImageUrl(x.url) && !isLikelyLogoUrl(x.url))
                .sort((a, b) => ((b.bonus || 0) + (b.semantic || 0)) - ((a.bonus || 0) + (a.semantic || 0)))
                .slice(0, VISION_TOP_N)
                .map((x) => x.url)
            )]
          : [];
        if (VISION_ENABLED && visionCandidateUrls.length) {
          logInfo(`  [Vision] scoring ${visionCandidateUrls.length} image candidates...`);
        }
        // Called even when vision is disabled (with an empty URL list). The result is used as a
        // Map below (.values(), .size) and in the recovery step (.has(), .set()).
        const visionScores = await scoreImagesWithVision(visionCandidateUrls, { industry: industryCategory });
        if (VISION_ENABLED) {
          // A score at or below -9999 is counted as a rejection.
          const rejected = [...visionScores.values()].filter((v) => v.score <= -9999).length;
          logDebug(`  [Vision] scored=${visionScores.size} rejected=${rejected}`);
        }

        // First pick from the site's own images; may be replaced by the recovery step below.
        let { cover, gallery } = await pickTopImages(siteResult.imageCandidates, siteResult.logoUrl, { visionScores });
        logDebug(`  [Images] cover=${cover ? "yes" : "no"} gallery=${gallery.length}`);
        // A corporate-looking image pool triggers the fallback even when enough images were found.
        const siteImagesCorporate = looksCorporateImagePool(siteResult.imageCandidates);
        if (siteImagesCorporate) {
          logInfo("  [Images] site set looks corporate-heavy; enabling social/image fallback.");
        }

        // Fallback images if gallery insufficient
        if ((!cover || gallery.length < MIN_GALLERY_IMAGES || siteImagesCorporate) && RECOVER_IMAGES) {
          const rec = await tryRecoverImages(name, finalCity, website, {
            socialUrls: [
              siteResult.facebook,
              siteResult.instagram,
              siteResult.linkedin,
              siteResult.tiktok,
              localBce?.facebook,
              localBce?.instagram,
              localBce?.linkedin,
              localBce?.tiktok,
            ].filter(Boolean),
          });
          const recoveredCandidates = Array.isArray(rec?.candidates) ? rec.candidates : [];
          const recoveryBonus = siteImagesCorporate ? 8 : 3;
          const merged = [
            ...(siteResult.imageCandidates || []),
            ...recoveredCandidates.map((x) => ({
              url: x.url,
              bonus: Number(x.bonus || 0) + recoveryBonus,
              semantic: Number(x.semantic || 0),
            })),
            ...(rec.cover ? [{ url: rec.cover, bonus: 8 + recoveryBonus, semantic: 2 }] : []),
            ...(rec.gallery || []).map((u) => ({ url: u, bonus: 2 + recoveryBonus, semantic: 1 })),
          ];
          // Re-score recovered candidates with vision too
          const extraVisionUrls = VISION_ENABLED
            ? recoveredCandidates.map((x) => x.url).filter((u) => u && !visionScores.has(u)).slice(0, 8)
            : [];
          if (extraVisionUrls.length) {
            logInfo(`  [Vision] scoring ${extraVisionUrls.length} recovered image candidates...`);
            const extraScores = await scoreImagesWithVision(extraVisionUrls, { industry: industryCategory });
            for (const [k, v] of extraScores) visionScores.set(k, v);
          }
          ({ cover, gallery } = await pickTopImages(merged, siteResult.logoUrl, { visionScores }));
        }

        // Quality rule: cover required + MIN_GALLERY_IMAGES
        if (!cover || gallery.length < MIN_GALLERY_IMAGES) {
          logInfo(`  [Skip] not enough images (need cover + ${MIN_GALLERY_IMAGES} gallery)`);
          log({
            ...result,
            status: "skip",
            reason: "not_enough_images_even_after_recovery",
            bce,
            ms: Date.now() - startedAt,
            minGallery: MIN_GALLERY_IMAGES,
          });
          continue;
        }

        // LLM (Revelys structure)
        // Up to LLM_MAX_ATTEMPTS calls to enrichWithLLM. The first attempt whose output passes
        // every check below populates the result variables and ends the loop.
        logInfo("  [LLM] structuring (Ollama)...");
        // Accepted-output state (filled on success, or by the one_liner fallback after the loop).
        let llm = null;
        let llmConfidence = null;
        let ritualsNormalized = [];
        let whyCompanyQuote = null;
        let services = [];
        let zone = finalCity;
        let pricing_model = null;
        let budget_level = null;
        let price_indication = null;
        // true when the crawl flagged devisGratuit, otherwise null (never false).
        const devis_gratuit = siteResult.devisGratuit ? true : null;
        let tagsNormalized = [];
        let servicesText = null;
        let emotionalNeedLabel = null;
        let descriptionLine1 = null;

        // Failure bookkeeping: always reflects the most recent failed attempt.
        let llmFailureReason = "llm_request_failed";
        let llmLastError = null;
        let llmFailureDetail = null;
        let llmRejectedOneLinerRaw = null;
        let llmRejectedOneLinerCleaned = null;
        let llmAttemptsUsed = 0;
        // Last payload that passed every check except the one_liner; used after the loop.
        let fallbackCandidateFromOneLinerFailure = null;
        let descriptionLine1FromFallback = false;

        for (let attempt = 1; attempt <= LLM_MAX_ATTEMPTS; attempt++) {
          llmAttemptsUsed = attempt;
          const attemptTemp = llmTemperatureForAttempt(attempt);
          if (attempt > 1) {
            // Linear backoff: the delay grows with the attempt number.
            const wait = LLM_RETRY_DELAY_MS * (attempt - 1);
            logInfo(`  [LLM] retry ${attempt}/${LLM_MAX_ATTEMPTS} (temp=${attemptTemp.toFixed(2)})...`);
            if (wait > 0) await sleep(wait);
          }

          let candidate = null;
          try {
            candidate = await enrichWithLLM({
              industry: industryCategory,
              cityHint: finalCity,
              companyName: name,
              corpus: siteResult.corpus,
              priceRangeHint: siteResult.priceRange,
              devisGratuitHint: siteResult.devisGratuit,
              retryAttempt: attempt,
              maxAttempts: LLM_MAX_ATTEMPTS,
              temperature: attemptTemp,
            });
          } catch (err) {
            // A thrown request counts as a failed attempt; the message is truncated to 200 chars.
            llmFailureReason = "llm_request_failed";
            llmLastError = String(err?.message || err || "unknown_error").slice(0, 200);
            llmFailureDetail = null;
            continue;
          }

          // Structural minimum on the raw output: at least 6 tags and 3 rituals.
          if (!candidate || !Array.isArray(candidate.tags) || candidate.tags.length < 6 || !Array.isArray(candidate.rituals) || candidate.rituals.length < 3) {
            llmFailureReason = "llm_invalid_output";
            llmFailureDetail = null;
            continue;
          }

          // Checks on normalized values: why_company needs 20+ characters, rituals need 3+ entries.
          const candidateRituals = normalizeRituals(candidate.rituals);
          const candidateWhyCompany = normalizeWhyCompanyQuote(candidate.why_company);
          if (!candidateWhyCompany || String(candidateWhyCompany).trim().length < 20) {
            llmFailureReason = "why_company_too_short";
            llmFailureDetail = null;
            continue;
          }
          if (!candidateRituals || candidateRituals.length < 3) {
            llmFailureReason = "rituals_invalid";
            llmFailureDetail = null;
            continue;
          }

          let candidateServices = normalizeServicesList(candidate.services_principaux, { max: 6 });
          const candidateZone = clampLine(sanitizePublicFreeText(candidate.zone_intervention || finalCity), 120) || finalCity;
          let candidateTags = normalizeTags(candidate.tags, { max: 12 });
          // With no usable services, derive them from the tags.
          if (!candidateServices.length) {
            candidateServices = normalizeServicesList(candidateTags, { max: 6 });
          }
          candidateTags = enrichTagsWithServices(candidateTags, candidateServices, { max: 12 });
          const candidateServicesText = servicesListToCsv(candidateServices);
          const candidateEmotionalNeedLabel = normalizeEmotionalNeedLabel(candidate.emotional_need_label, {
            industry: industryCategory,
            services: candidateServices,
          });
          // The one_liner check returns the accepted value, or a reason plus the cleaned candidate.
          const oneLinerCheck = normalizeSeedDescriptionLine1WithReason(candidate.one_liner, {
            name,
            industry: industryCategory,
            city: candidateZone,
            postal_code: finalPostalCode,
            services: candidateServices,
          });
          const candidateDescriptionLine1 = oneLinerCheck.value;
          if (!candidateDescriptionLine1) {
            llmFailureReason = "description_one_liner_invalid";
            llmFailureDetail = oneLinerCheck.reason || null;
            llmRejectedOneLinerRaw = clampLine(sanitizePublicFreeText(candidate.one_liner), 500) || null;
            llmRejectedOneLinerCleaned = oneLinerCheck.candidate || null;
            // Everything else in this payload is valid: keep it so a template one_liner can be
            // substituted if no later attempt fully succeeds.
            fallbackCandidateFromOneLinerFailure = {
              candidate,
              candidateRituals,
              candidateWhyCompany,
              candidateServices,
              candidateZone,
              candidateTags,
              candidateServicesText,
              candidateEmotionalNeedLabel,
            };
            continue;
          }
          llmFailureDetail = null;

          // All checks passed: commit this attempt and stop retrying.
          llm = candidate;
          llmConfidence = normalizeLlmConfidence(candidate.confidence);
          ritualsNormalized = candidateRituals;
          whyCompanyQuote = candidateWhyCompany;
          services = candidateServices;
          zone = candidateZone;
          // Pricing fields fall back to values derived from the crawl when the LLM gives none.
          pricing_model = normalizePricingModel(candidate.pricing_model) || pricingModelFromText(siteResult.corpus);
          budget_level = normalizeBudgetLevel(candidate.budget_level) ?? budgetLevelFromPriceRange(siteResult.priceRange);
          price_indication = normalizePriceIndication(candidate.price_indication);
          tagsNormalized = candidateTags;
          servicesText = candidateServicesText;
          emotionalNeedLabel = candidateEmotionalNeedLabel;
          descriptionLine1 = candidateDescriptionLine1;
          descriptionLine1FromFallback = false;
          break;
        }

        // No attempt passed every check, but at least one produced a payload that was valid apart
        // from its one_liner: adopt that payload and substitute a template-built description line.
        if (!llm && fallbackCandidateFromOneLinerFailure) {
          const fb = fallbackCandidateFromOneLinerFailure;
          llm = fb.candidate;
          llmConfidence = normalizeLlmConfidence(fb.candidate.confidence);
          ritualsNormalized = fb.candidateRituals;
          whyCompanyQuote = fb.candidateWhyCompany;
          services = fb.candidateServices;
          zone = fb.candidateZone;
          pricing_model = normalizePricingModel(fb.candidate.pricing_model) || pricingModelFromText(siteResult.corpus);
          budget_level = normalizeBudgetLevel(fb.candidate.budget_level) ?? budgetLevelFromPriceRange(siteResult.priceRange);
          price_indication = normalizePriceIndication(fb.candidate.price_indication);
          tagsNormalized = fb.candidateTags;
          servicesText = fb.candidateServicesText;
          emotionalNeedLabel = fb.candidateEmotionalNeedLabel;
          descriptionLine1 = buildFallbackSeedDescriptionLine1({
            name,
            industry: industryCategory,
            city: fb.candidateZone || finalCity,
            postal_code: finalPostalCode,
            services: fb.candidateServices,
          });
          descriptionLine1FromFallback = Boolean(descriptionLine1);
          if (!descriptionLine1FromFallback) {
            // Template build failed as well: clear `llm` so the candidate is skipped below.
            llm = null;
            // A later attempt may have overwritten the reason (for example with
            // "llm_request_failed", whose log entry carries no reason_detail). Realign it so the
            // skip log reports the one_liner failure together with this detail.
            llmFailureReason = "description_one_liner_invalid";
            llmFailureDetail = "fallback_one_liner_build_failed";
          } else {
            llmFailureDetail = `${llmFailureDetail || "one_liner_invalid"}|fallback_used`;
            logInfo("  [LLM] one_liner strict checks failed; fallback template applied.");
          }
        }

        // No usable LLM output: log a skip entry whose shape depends on the last failure reason,
        // then move on to the next candidate.
        if (!llm) {
          if (llmFailureReason === "description_one_liner_invalid") {
            // Includes the rejected one_liner (raw and cleaned) for later inspection.
            logInfo("  [Skip] description one_liner invalid (strict)");
            log({
              ...result,
              status: "skip",
              reason: "description_one_liner_invalid",
              reason_detail: llmFailureDetail || "unknown",
              rejected_one_liner_raw: llmRejectedOneLinerRaw || null,
              rejected_one_liner_cleaned: llmRejectedOneLinerCleaned || null,
              bce,
              llm_attempts: llmAttemptsUsed || LLM_MAX_ATTEMPTS,
              ms: Date.now() - startedAt,
            });
          } else if (llmFailureReason === "llm_request_failed") {
            // Carries the truncated error message; this entry has no reason_detail field.
            logInfo("  [Skip] LLM request failed");
            log({
              ...result,
              status: "skip",
              reason: "llm_request_failed",
              bce,
              error: llmLastError || "unknown_error",
              llm_attempts: llmAttemptsUsed || LLM_MAX_ATTEMPTS,
              ms: Date.now() - startedAt,
            });
          } else {
            // Any other validation failure is logged under its own reason code.
            logInfo(`  [Skip] LLM output invalid (${llmFailureReason})`);
            log({
              ...result,
              status: "skip",
              reason: llmFailureReason,
              reason_detail: llmFailureDetail || null,
              bce,
              llm_attempts: llmAttemptsUsed || LLM_MAX_ATTEMPTS,
              ms: Date.now() - startedAt,
            });
          }
          continue;
        }

        if (descriptionLine1FromFallback) {
          logDebug("  [LLM] description line1 fallback used after retries.");
        }

        // Counts are taken from the raw LLM payload, not from the normalized lists.
        logInfo(
          `  [LLM] ok (tags=${llm.tags.length}, rituals=${llm.rituals.length}, confidence=${llmConfidence === null ? "?" : llmConfidence})`
        );

        // Assets upload
        // Without uploads the original remote URLs are kept as-is. With uploads the logo starts
        // as null and is only set once an upload succeeds.
        let logoUrlFinal = UPLOAD_IMAGES ? null : siteResult.logoUrl;
        let coverFinal = cover;
        // The gallery never repeats the cover and is capped at MAX_GALLERY_TO_STORE entries.
        let galleryFinal = dedupeImageUrls(gallery, { exclude: cover ? [cover] : [] }).slice(0, MAX_GALLERY_TO_STORE);

        if (UPLOAD_IMAGES) {
          logInfo("  [Images] uploading (WebP) to Supabase Storage...");
          const basePath = `companies/${slug}`;

          // logo (mandatory): combine site logo candidates + social profile-photo candidates.
          // Candidates are keyed by URL; when a URL is added more than once its highest score is kept.
          const logoScoreByUrl = new Map();
          const addLogoCandidate = (rawUrl, score = 0) => {
            const u = safeUrl(rawUrl);
            if (!u || isBadLogoUrl(u)) return;
            const prev = logoScoreByUrl.get(u);
            if (prev === undefined || score > prev) logoScoreByUrl.set(u, score);
          };
          // Scores: primary site logo 120; other site candidates 100, 99, ... by position.
          addLogoCandidate(siteResult.logoUrl, 120);
          const siteLogoCandidates = Array.isArray(siteResult.logoCandidates) ? siteResult.logoCandidates : [];
          siteLogoCandidates.forEach((u, idx) => addLogoCandidate(u, 100 - idx));

          const socialUrlsKnown = [
            siteResult.facebook,
            siteResult.instagram,
            siteResult.linkedin,
            siteResult.tiktok,
            localBce?.facebook,
            localBce?.instagram,
            localBce?.linkedin,
            localBce?.tiktok,
          ]
            .map((u) => normalizeSocialProfileUrl(u))
            .filter(Boolean);
          // Social candidates start from a base of 180, which is above every site-derived score.
          const socialLogoCandidates = await recoverSocialLogoCandidates(socialUrlsKnown);
          socialLogoCandidates.forEach((x, idx) => addLogoCandidate(x.url, 180 + Number(x.score || 0) - idx));

          // Highest score first.
          const logoSources = Array.from(logoScoreByUrl.entries())
            .sort((a, b) => b[1] - a[1])
            .map(([u]) => u);
          logDebug(`[Images] logo candidates site=${siteLogoCandidates.length} social=${socialLogoCandidates.length} total=${logoSources.length}`);
          // Try good logos first (not white, not dark-bg). Keep the best dark-bg one as fallback.
          let darkBgLogoBuf = null;
          let darkBgLogoSrc = null;
          for (const logoSrc of logoSources) {
            // Second argument is presumably a size cap in bytes; confirm in downloadAndConvertWebp.
            const buf = await downloadAndConvertWebp(logoSrc, 2_500_000);
            if (!buf) continue;
            if (await isLikelyWhiteLogoBuffer(buf)) {
              logDebug(`  [Images] rejected white-on-light logo: ${logoSrc}`);
              continue;
            }
            if (await isLikelyDarkBackgroundLogo(buf)) {
              logDebug(`  [Images] deprioritized dark-bg logo (white text on dark): ${logoSrc}`);
              // Only the first (highest-ranked) dark-background logo is remembered.
              if (!darkBgLogoBuf) { darkBgLogoBuf = buf; darkBgLogoSrc = logoSrc; }
              continue;
            }
            // Stop at the first successful upload; a failed upload moves on to the next candidate.
            const up = await uploadImageWebp(`${basePath}/logo.webp`, buf);
            if (up) { logoUrlFinal = up; break; }
          }
          // Fallback: accept dark-bg logo if no clean candidate was found
          if (!logoUrlFinal && darkBgLogoBuf) {
            logDebug(`  [Images] logo fallback to dark-bg candidate: ${darkBgLogoSrc}`);
            const up = await uploadImageWebp(`${basePath}/logo.webp`, darkBgLogoBuf);
            if (up) logoUrlFinal = up;
          }

          // cover (robust): if the picked cover fails, fallback to other image candidates.
          let coverSourceUsed = null;
          let coverHash = null;
          let coverPHash = null;
          {
            const coverCandidates = dedupeImageUrls([cover, ...(Array.isArray(gallery) ? gallery : [])]);
            for (const candidateSrc of coverCandidates) {
              const buf = await downloadAndConvertWebp(candidateSrc, 4_500_000);
              if (!buf) continue;
              if (await isLikelyTransparentGraphic(buf)) {
                logDebug(`  [Images] skipping transparent graphic as cover: ${candidateSrc}`);
                continue;
              }
              const up = await uploadImageWebp(`${basePath}/cover.webp`, buf);
              if (!up) continue;
              coverFinal = up;
              coverSourceUsed = candidateSrc;
              coverHash = sha256Buffer(buf);
              coverPHash = await computePerceptualHash(buf);
              break;
            }
            if (!coverSourceUsed) {
              logInfo("  [Skip] cover upload failed (all candidates)");
              log({ ...result, status: "skip", reason: "cover_upload_fail", bce, ms: Date.now() - startedAt });
              continue;
            }
          }

          // gallery (up to MAX_GALLERY_TO_STORE, but we require MIN_GALLERY_IMAGES)
          const newGallery = [];
          const galleryHashes = new Set();
          const galleryPHashes = [];
          const gallerySources = dedupeImageUrls(Array.isArray(gallery) ? gallery : [], {
            exclude: coverSourceUsed ? [coverSourceUsed] : [],
          });
          let gIndex = 0;
          for (const src of gallerySources) {
            if (newGallery.length >= MAX_GALLERY_TO_STORE) break;
            const buf = await downloadAndConvertWebp(src, 4_500_000);
            if (!buf) continue;
            if (await isLikelyTransparentGraphic(buf)) {
              logDebug(`  [Images] skipping transparent graphic in gallery: ${src}`);
              continue;
            }
            const hash = sha256Buffer(buf);
            const phash = await computePerceptualHash(buf);
            // SHA256 exact dedup
            if (hash && (hash === coverHash || galleryHashes.has(hash))) continue;
            // Perceptual dedup: catches same image at different compression/resize
            if (phash) {
              if (coverPHash && perceptualHashDistance(phash, coverPHash) <= PHASH_DISTANCE_THRESHOLD) continue;
              if (galleryPHashes.some((h) => perceptualHashDistance(phash, h) <= PHASH_DISTANCE_THRESHOLD)) continue;
            }
            const up = await uploadImageWebp(`${basePath}/g${gIndex + 1}.webp`, buf);
            if (up) newGallery.push(up);
            if (up && hash) galleryHashes.add(hash);
            if (up && phash) galleryPHashes.push(phash);
            if (up) gIndex += 1;
          }
          if (newGallery.length < MIN_GALLERY_IMAGES) {
            logInfo(`  [Skip] gallery upload incomplete (${newGallery.length}/${MIN_GALLERY_IMAGES})`);
            log({
              ...result,
              status: "skip",
              reason: "gallery_upload_incomplete",
              bce,
              ms: Date.now() - startedAt,
              minGallery: MIN_GALLERY_IMAGES,
            });
            continue;
          }
          galleryFinal = newGallery.slice(0, MAX_GALLERY_TO_STORE);
          logDebug(`  [Images] uploaded cover + ${galleryFinal.length} gallery images`);
        }

        // Logo is mandatory: never fallback to cover/gallery.
        if (!logoUrlFinal) {
          logInfo("  [Skip] logo missing after extraction/upload");
          log({ ...result, status: "skip", reason: "logo_missing_or_invalid", bce, ms: Date.now() - startedAt });
          continue;
        }

        // Build parseable description (line 1 is the "bio"; following lines are structured)
        const description = buildDescription(
          descriptionLine1,
          servicesText,
          zone,
          llm.premiere_prise_contact,
          llm.piliers
        );

        const hasPublishableAddress = isPublishableAddressStreet(addressStreet);
        const hasCredibleContact = Boolean(email || publicPhone);
        if (STRICT_PUBLISH_QUALITY_GATE) {
          if (!hasPublishableAddress) {
            logInfo("  [Skip] address missing/invalid for publish");
            log({ ...result, status: "skip", reason: "address_missing_or_invalid_for_publish", bce, ms: Date.now() - startedAt });
            continue;
          }
          if (!hasCredibleContact) {
            logInfo("  [Skip] no credible contact for publish (email/phone)");
            log({ ...result, status: "skip", reason: "no_credible_contact_for_publish", bce, ms: Date.now() - startedAt });
            continue;
          }
          if (oneLinerLooksWeak(descriptionLine1)) {
            logInfo("  [Skip] description one_liner weak/template");
            log({
              ...result,
              status: "skip",
              reason: "description_one_liner_weak_template",
              reason_detail: "post_llm_quality_gate",
              rejected_one_liner_cleaned: descriptionLine1 || null,
              bce,
              ms: Date.now() - startedAt,
            });
            continue;
          }
        }

        const facebook = normalizeSocialProfileUrl(siteResult.facebook || localBce?.facebook || null);
        const instagram = normalizeSocialProfileUrl(siteResult.instagram || localBce?.instagram || null);
        const linkedin = normalizeSocialProfileUrl(siteResult.linkedin || localBce?.linkedin || null);
        const tiktok = normalizeSocialProfileUrl(siteResult.tiktok || localBce?.tiktok || null);
        const contactName = clampLine(siteResult.contact_name, 120);
        let founderName = clampLine(siteResult.founder_name, 120);
        let founderRole = clampLine(siteResult.founder_role, 120);
        let founderPhotoUrl = normalizePublicUrl(siteResult.founder_photo_url);
        if (!founderName && RECOVER_FOUNDER_EXTERNAL) {
          const founderRecovered = await tryRecoverFounderFromSnippets({
            companyName: name,
            city: finalCity,
            websiteUrl: website,
            linkedinUrl: linkedin,
          });
          if (founderRecovered?.founder_name) {
            founderName = clampLine(founderRecovered.founder_name, 120);
            founderRole = clampLine(founderRecovered.founder_role, 120) || founderRole;
            if (!founderPhotoUrl) founderPhotoUrl = normalizePublicUrl(founderRecovered.founder_photo_url);
          }
        }
        if (UPLOAD_IMAGES && founderPhotoUrl && !isSupabaseStorageUrl(founderPhotoUrl)) {
          founderPhotoUrl = null;
        }
        if (founderName && (isFounderPlaceholderName(founderName) || !looksLikePersonName(founderName, name))) {
          founderName = null;
          founderRole = null;
          founderPhotoUrl = null;
        }
        if (founderName && !founderRole) founderRole = "Fondateur";
        if (!founderName) {
          founderName = null;
          founderRole = null;
          founderPhotoUrl = null;
        }
        const ideal_zone = zone || clampLine(sanitizePublicFreeText(siteResult.ideal_zone || finalCity), 120) || finalCity;
        const languages = Array.from(new Set([...(Array.isArray(siteResult.languages) ? siteResult.languages : []), "français"]))
          .map((x) => clampLine(x, 40))
          .filter(Boolean)
          .slice(0, 6);
        const availability = clampLine(sanitizePublicFreeText(siteResult.availability || llm.premiere_prise_contact || null), 160);
        const opening_hours = sanitizeOpeningHours(siteResult.opening_hours);
        const google_rating = (() => {
          if (!hasScalarValue(siteResult.google_rating)) return null;
          const n = Number(siteResult.google_rating);
          if (!Number.isFinite(n) || n < 0) return null;
          return Math.min(5, Math.max(0, n));
        })();
        const google_reviews_count = (() => {
          if (hasScalarValue(siteResult.google_reviews_count)) {
            const n = Number(siteResult.google_reviews_count);
            if (Number.isFinite(n) && n >= 0) return Math.round(n);
          }
          if (Array.isArray(siteResult.google_reviews) && siteResult.google_reviews.length) {
            return siteResult.google_reviews.length;
          }
          return null;
        })();
        const google_reviews =
          Array.isArray(siteResult.google_reviews) && siteResult.google_reviews.length ? siteResult.google_reviews : null;
        const companyFaqNormalized = normalizeCompanyFaq(llm.company_faq);
        // LLM FAQ is used directly when it provides enough entries (at least 2).
        // Fallback templates are only injected when LLM output is insufficient (<2 entries).
        const companyFaqFinal = companyFaqNormalized.length >= 2
          ? companyFaqNormalized.slice(0, 6)
          : (() => {
              const companyFaqFallback = buildFallbackCompanyFaq({
                services,
                zone,
                firstContact: llm.premiere_prise_contact,
                rituals: ritualsNormalized,
                whyCompany: whyCompanyQuote,
                availability,
              });
              const merged = normalizeCompanyFaq([...companyFaqNormalized, ...companyFaqFallback]);
              return (merged.length ? merged : companyFaqFallback).slice(0, 6);
            })();
        const emailDomain = (() => {
          const v = normalizeEmail(email);
          if (!v || !v.includes("@")) return null;
          return v.split("@")[1] || null;
        })();
        const hasCorporateEmail = Boolean(emailDomain && !PUBLIC_EMAIL_PROVIDERS.has(emailDomain));
        const socialCount = [facebook, instagram, linkedin, tiktok].filter(Boolean).length;
        const rasScore = buildCalibratedRasScore({
          llmRasScore: llm.ras_score,
          llmConfidence,
          servicesCount: services.length,
          ritualsCount: ritualsNormalized.length,
          tagsCount: tagsNormalized.length,
          hasCorporateEmail,
          hasEmail: Boolean(email),
          hasPhone: Boolean(publicPhone || contactPhone),
          hasAddress: Boolean(addressStreet),
          hasCity: Boolean(finalCity),
          hasFaq: companyFaqFinal.length >= 4,
          hasWebsite: Boolean(website),
          socialCount,
          galleryCount: Array.isArray(galleryFinal) ? galleryFinal.length : 0,
          hasCover: Boolean(coverFinal),
          recoveredBceByIdentity: Boolean(bceRecoveredByIdentity),
          seed: `${slug}|${bce}|${website || ""}|${email || ""}`,
        });

        // SEO/GEO pack (for public, non-claimed profiles)
        let seo_title = null;
        let seo_description = null;
        let og_title = null;
        let og_description = null;
        let seo_jsonld = null;
        let seo_generated_at = null;
        let seo_ai_used = false;
        let seo_last_inputs_hash = null;

        if (GENERATE_SEO) {
          logInfo("  [SEO] generating pack...");
          const revelys_url = makeCompanyPublicUrl(slug);
          const seoInputs = {
            name,
            industry: seoIndustryKeyword,
            city: finalCity,
            address: addressStreet || null,
            postal_code: finalPostalCode,
            country: "BE",
            website: website || null,
            tags: tagsNormalized,
            services: services,
            rituals: ritualsNormalized,
            why_company: whyCompanyQuote,
            emotional_need_label: emotionalNeedLabel,
            pricing_model,
            budget_level,
            price_indication,
            devis_gratuit,
            logo_url: logoUrlFinal || null,
            cover_image_url: coverFinal || null,
            gallery_urls: galleryFinal || [],
            revelys_url,
          };

          seo_last_inputs_hash = sha256Hex(JSON.stringify(seoInputs));
          seo_generated_at = new Date().toISOString();

          let seoPack = null;
          try {
            seoPack = await enrichSeoWithLLM(seoInputs);
            if (seoPack) seo_ai_used = true;
          } catch {
            seoPack = null;
          }
          if (!seoPack) seoPack = buildFallbackSeoPack(seoInputs);

          seo_title = seoPack?.seo_title || null;
          seo_description = seoPack?.seo_description || null;
          og_title = seoPack?.og_title || null;
          og_description = seoPack?.og_description || null;
          seo_jsonld = seoPack?.seo_jsonld || null;
          logDebug(`  [SEO] ok (ai_used=${seo_ai_used ? "yes" : "no"})`);

          // Ensure JSON-LD uses the canonical public URL + sameAs when available.
          if (seo_jsonld && typeof seo_jsonld === "object" && !Array.isArray(seo_jsonld)) {
            if (!seo_jsonld.url && revelys_url) seo_jsonld.url = revelys_url;
            const sameAsRaw = seo_jsonld.sameAs;
            const sameAsList = (Array.isArray(sameAsRaw) ? sameAsRaw : hasScalarValue(sameAsRaw) ? [sameAsRaw] : [])
              .map((x) => normalizePublicUrl(x))
              .filter(Boolean);
            if (!sameAsList.length && website) sameAsList.push(website);
            if (sameAsList.length) seo_jsonld.sameAs = Array.from(new Set(sameAsList));
            else delete seo_jsonld.sameAs;
          }
        }

        // verification_checks (array for UI)
        const checks = [
          { label: "Seed: site crawl", date: new Date().toISOString(), proof: cand.url },
          { label: "Seed: city hint", date: new Date().toISOString(), proof: city },
          { label: "Seed: industry category", date: new Date().toISOString(), proof: industryCategory },
          { label: "Seed: search keyword", date: new Date().toISOString(), proof: seoIndustryKeyword },
          { label: "Extracted: city", date: new Date().toISOString(), proof: extractedCity || "" },
          { label: "BCE: mod97 valid", date: new Date().toISOString(), proof: bce },
          { label: "Email extracted", date: new Date().toISOString(), proof: email },
          {
            label: `LLM structured (${OLLAMA_MODEL})`,
            date: new Date().toISOString(),
            proof: `confidence=${llmConfidence === null ? "unknown" : `${llmConfidence}/100`}`,
          },
        ];
        if (bceRecoveredByIdentity) {
          checks.push({
            label: "BCE recovered from local identity",
            date: new Date().toISOString(),
            proof: `score=${bceRecoveredByIdentity.score}`,
          });
        }
        if (publicPhone) {
          // Avoid exposing the actual phone number in public-facing verification payloads.
          const src = publicPhoneSource ? `source=${publicPhoneSource}` : "source=unknown";
          checks.push({ label: "Phone extracted", date: new Date().toISOString(), proof: src });
        }
        if (UPLOAD_IMAGES) checks.push({ label: "Images uploaded (Supabase Storage)", date: new Date().toISOString(), proof: coverFinal });

        const row = {
          slug,
          market: MARKET,
          name,
          industry: industryCategory,
          tags: tagsNormalized,
          services: services.length ? services : null,
          pricing_model,
          budget_level,
          price_indication,
          devis_gratuit,
          city: finalCity,
          postal_code: finalPostalCode,
          country: "BE",
          address: addressStreet || null,
          website: website,
          facebook: facebook,
          instagram: instagram,
          linkedin: linkedin,
          tiktok: tiktok,
          contact_email: email,
          contact_name: contactName || null,
          contact_phone: contactPhone,
          public_phone: publicPhone || null,
          founder_name: founderName || null,
          founder_role: founderRole || null,
          founder_photo_url: founderPhotoUrl || null,
          ideal_zone: ideal_zone || null,
          languages: languages.length ? languages : ["français"],
          availability: availability || null,
          opening_hours: opening_hours,
          google_rating: google_rating,
          google_reviews_count: google_reviews_count,
          google_reviews: google_reviews,
          company_faq: companyFaqFinal.length ? companyFaqFinal : null,
          description,
          seo_title,
          seo_description,
          og_title,
          og_description,
          seo_jsonld,
          seo_generated_at,
          seo_ai_used,
          seo_version: SEO_VERSION,
          seo_last_inputs_hash,
          why_company: whyCompanyQuote,
          why_entrepreneur: llm.why_entrepreneur ? normSpaces(llm.why_entrepreneur) : null,
          emotional_need_label: emotionalNeedLabel,
          rituals: ritualsNormalized,
          ras_score: rasScore,
          logo_url: logoUrlFinal || null,
          cover_image_url: coverFinal,
          gallery_urls: galleryFinal,
          video_urls: [],
          video_embed_url: null,
          tier: "listed",
          verification_status: "pending",
          verification_checks: checks,
          content_status: "published",
          is_claimed: false,
          opt_out_status: "none",
          bce_number: localBce?.bce_number || formatBceForUi(bce),
          bce_status: bceStatus,
          bce_source: bceSource,
          bce_verified_at: bceVerifiedAt,
          bce_last_checked_at: bceVerifiedAt,
          bce_legal_name: bceLegalName || name,
          founded_on: foundedOn,
          bce_type_of_enterprise: bceTypeOfEnterprise,
          bce_juridical_form: bceJuridicalForm,
          bce_juridical_situation: bceJuridicalSituation,
          bce_source_update_date: bceSourceUpdateDate,
          created_at: new Date().toISOString(),
        };

        if (!DRY_RUN) {
          fs.appendFileSync(outSql, buildCompanyInsertSQL(row) + "\n\n", "utf-8");
        }

        usedBce.add(bce);
        usedSet.add(dom);
        pairCounts.set(key, pairCounts.get(key) + 1);

        // Stats
        const statsObj = {};
        for (const [k, v] of pairCounts.entries()) statsObj[k] = v;
        fs.writeFileSync(outStats, JSON.stringify(statsObj, null, 2), "utf-8");

        console.log(`  OK (${pairCounts.get(key)}/${TARGET_PER_PAIR}) -> ${name} | BCE ${bce} | ${email}`);
        log({ ...result, status: "ok", slug, bce, email, name, pagesVisited: siteResult.pagesVisited, ms: Date.now() - startedAt });

        if (pairCounts.get(key) >= TARGET_PER_PAIR) {
          console.log(`  Target reached for ${industry} | ${city}`);
        }
      }
      if (pairCounts.get(key) < TARGET_PER_PAIR) {
        const reason =
          crawledForPair >= MAX_SITES_TO_TRY
            ? `max crawls reached (${MAX_SITES_TO_TRY})`
            : "no more eligible search candidates";
        logInfo(`  [Target] incomplete for ${industry} | ${city}: ${pairCounts.get(key)}/${TARGET_PER_PAIR} (${reason})`);
      }
    }
  }

  if (!DRY_RUN && APPEND_PUBLIC_REFRESH_SQL) {
    fs.appendFileSync(
      outSql,
      "-- Refresh public materialized view (companies_public)\nSELECT refresh_companies_public();\n",
      "utf-8"
    );
  }

  console.log("\nDone.");
  console.log("SQL:", outSql);
  console.log("LOG:", outLog);
}

// Script entry point: kick off the run. If the promise returned by main()
// rejects, print the error under the "Fatal:" prefix and terminate the
// process with exit status 1 so callers (shell, CI) can detect the failure.
main().catch(function reportFatal(error) {
  console.error("Fatal:", error);
  process.exit(1);
});
