/**
 * OpenRights — Title Deduplicator
 *
 * Detects duplicate titles in a Google Sheet using:
 *   1. EIDR exact match
 *   2. ISAN exact match
 *   3. Normalized title + year match
 *   4. Levenshtein distance (≤ 2) on normalized title + same year
 *
 * Expected input sheet: "Titles Input"
 * Columns (row 1 = headers, data starts row 2):
 *   A: Title ID
 *   B: Title
 *   C: Original Title
 *   D: Year
 *   E: EIDR
 *   F: ISAN
 *
 * Output sheet: "Dedupe Candidates" (created / overwritten).
 *
 * Usage: Extensions → Apps Script → paste this file → Run findDuplicateTitles().
 *
 * openrights.blog
 */

// Name of the sheet findDuplicateTitles() reads title rows from.
const INPUT_SHEET = "Titles Input";
// Name of the sheet findDuplicateTitles() writes candidate pairs to; it is
// created if missing and cleared on every run.
const OUTPUT_SHEET = "Dedupe Candidates";
// Largest Levenshtein distance (inclusive) between two normalized titles that
// is still reported as a fuzzy match.
const FUZZY_MAX_DISTANCE = 2;

/**
 * Scans the input sheet for likely duplicate titles and writes every candidate
 * pair to the output sheet.
 *
 * Reads columns A–F (Title ID, Title, Original Title, Year, EIDR, ISAN) from
 * row 2 down, skips rows with a blank Title, and compares every remaining pair
 * of rows. The output sheet is created if missing and cleared before writing.
 * A summary (or an explanation of why nothing ran) is shown in a UI alert.
 *
 * Note: the Original Title column is read but not currently used for matching.
 */
function findDuplicateTitles() {
  const ss = SpreadsheetApp.getActiveSpreadsheet();
  const input = ss.getSheetByName(INPUT_SHEET);
  if (!input) {
    SpreadsheetApp.getUi().alert(`No '${INPUT_SHEET}' sheet found. Create one with columns: Title ID, Title, Original Title, Year, EIDR, ISAN.`);
    return;
  }

  const lastRow = input.getLastRow();
  if (lastRow < 2) {
    // Tell the user why nothing happened instead of returning silently.
    SpreadsheetApp.getUi().alert(`'${INPUT_SHEET}' has no data rows below the header. Nothing to check.`);
    return;
  }

  const rows = input.getRange(2, 1, lastRow - 1, 6).getValues();
  const records = rows
    .map(([id, title, , year, eidr, isan]) => ({
      id: cellText_(id),
      title: cellText_(title),
      year: cellText_(year),
      // Identifiers are compared case-insensitively, so store one canonical case.
      eidr: cellText_(eidr).toUpperCase(),
      isan: cellText_(isan).toUpperCase(),
      normalized: normalizeTitle(title),
    }))
    .filter((record) => record.title);

  const candidates = [];
  for (let i = 0; i < records.length; i++) {
    const a = records[i];
    for (let j = i + 1; j < records.length; j++) {
      const b = records[j];
      const reasons = collectMatchReasons_(a, b);
      if (reasons.length > 0) {
        candidates.push([a.id, a.title, b.id, b.title, a.year, reasons.join("; ")]);
      }
    }
  }

  const headers = ["Title A ID", "Title A", "Title B ID", "Title B", "Year", "Match Reason"];
  const out = ss.getSheetByName(OUTPUT_SHEET) || ss.insertSheet(OUTPUT_SHEET);
  out.clear();
  out.getRange(1, 1, 1, headers.length)
    .setValues([headers])
    .setFontWeight("bold")
    .setBackground("#0D7D74")
    .setFontColor("#FFFFFF");

  if (candidates.length > 0) {
    out.getRange(2, 1, candidates.length, headers.length).setValues(candidates);
  } else {
    out.getRange(2, 1).setValue("No duplicate candidates found.");
  }
  out.autoResizeColumns(1, headers.length);
  SpreadsheetApp.getUi().alert(`Found ${candidates.length} dedupe candidate pair(s). See '${OUTPUT_SHEET}' sheet.`);
}

/**
 * Converts a raw cell value to trimmed text. Only null/undefined become "",
 * so a legitimate numeric 0 (e.g. a Title ID of 0) is kept as "0".
 */
function cellText_(value) {
  return value == null ? "" : String(value).trim();
}

/**
 * Lists every reason two records look like duplicates (empty array if none).
 *
 * EIDR and ISAN are checked independently of the title. Title checks require
 * both normalized titles to be non-empty and the year strings to be equal
 * (two blank years count as equal); an exact normalized match takes precedence
 * over the fuzzy check.
 */
function collectMatchReasons_(a, b) {
  const reasons = [];
  if (a.eidr && a.eidr === b.eidr) reasons.push("EIDR match");
  if (a.isan && a.isan === b.isan) reasons.push("ISAN match");

  if (a.normalized && b.normalized && a.year === b.year) {
    if (a.normalized === b.normalized) {
      reasons.push("Normalized title + year match");
    } else if (Math.abs(a.normalized.length - b.normalized.length) <= FUZZY_MAX_DISTANCE) {
      // Edit distance is never smaller than the length difference, so pairs
      // failing the check above cannot be within range; skip the costly call.
      const distance = levenshtein(a.normalized, b.normalized);
      if (distance <= FUZZY_MAX_DISTANCE) {
        reasons.push("Fuzzy match (distance " + distance + ")");
      }
    }
  }
  return reasons;
}

/**
 * Reduces a title to a canonical key for duplicate comparison.
 *
 * Steps, in order: lowercase, trim, drop ONE leading English article
 * ("the", "an" or "a" followed by whitespace), remove common punctuation
 * (ASCII and typographic quotes, hyphens and dashes, : ; , . ! ?), collapse
 * runs of whitespace to a single space, trim again.
 *
 * @param {*} raw - Title cell value; non-strings are converted with String().
 * @returns {string} The normalized title, or "" for null, undefined or "".
 */
function normalizeTitle(raw) {
  // Only genuinely empty input short-circuits; a numeric 0 is a valid title.
  if (raw == null || raw === "") return "";
  return String(raw)
    .toLowerCase()
    // Trim first so leading whitespace cannot hide the article from the ^ anchor.
    .trim()
    // A single alternation strips at most one article: "The A Team" -> "a team".
    .replace(/^(?:the|an|a)\s+/, "")
    .replace(/[:;,.!?'"‘’“”\-–—]/g, "")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Levenshtein edit distance between `a` and `b`: the minimum number of
 * single-character insertions, deletions and substitutions that turn one
 * string into the other. Uses the standard dynamic-programming recurrence,
 * keeping only the previous and current rows of the table.
 */
function levenshtein(a, b) {
  if (a === b) return 0;
  if (!a.length) return b.length;
  if (!b.length) return a.length;

  const width = a.length;
  // Row 0 of the table: distance from the empty prefix of b to each prefix of a.
  let previous = Array.from({ length: width + 1 }, (_, col) => col);

  for (let row = 1; row <= b.length; row += 1) {
    const target = b[row - 1];
    // Column 0: turning an empty prefix of a into b's first `row` characters.
    const current = [row];
    for (let col = 1; col <= width; col += 1) {
      const substitution = previous[col - 1] + (a[col - 1] === target ? 0 : 1);
      const insertion = current[col - 1] + 1;
      const deletion = previous[col] + 1;
      current.push(Math.min(insertion, deletion, substitution));
    }
    previous = current;
  }
  return previous[width];
}
