/**
 * RAG Utility v1.0
 * Last updated: Jul 14, 2024
 */

import axios from "axios";
import { Readability } from "@mozilla/readability";
import DOMPurify from "dompurify";
import * as cheerio from "cheerio";
import "@tensorflow/tfjs";
import * as use from "@tensorflow-models/universal-sentence-encoder";

// URL prefix that is string-prepended to every outgoing request in this module.
// Falls back to an empty string so that an unset environment variable does not
// turn into the literal text "undefined" at the front of each request URL.
const corsProxy = process.env.REACT_APP_CORS_PROXY ?? "";

// Network timeout, in milliseconds, shared by the requests in this module.
const TIMEOUT = 30 * 1000;

// RSS feeds queried by searchNews, keyed by a human-readable source name.
const rssFeeds = {
  "BBC News": "http://feeds.bbci.co.uk/news/rss.xml",
  "The Guardian": "https://www.theguardian.com/international/rss",
  SCMP: "https://www.scmp.com/rss/91/feed",
  France24: "https://www.france24.com/en/rss",
  Xinhua: "http://www.xinhuanet.com/english/rss/worldrss.xml",
  "Sky News": "https://feeds.skynews.com/feeds/rss/home.xml",
  Wired: "https://www.wired.com/feed/rss",
};

/**
 * Downloads a single RSS feed through the CORS proxy and extracts its items.
 *
 * @param {string} rssUrl - Absolute URL of the RSS feed.
 * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
 *   One entry per `<item>` that carries a link.
 * @throws {Error} When the request fails, times out, or returns a non-2xx status.
 */
const fetchRssItems = async (rssUrl) => {
  const response = await fetch(corsProxy + rssUrl, {
    signal: AbortSignal.timeout(TIMEOUT),
  });
  if (!response.ok) {
    throw new Error(
      `RSS request failed with status ${response.status}: ${rssUrl}`
    );
  }
  const body = await response.text();
  const xmlDoc = new DOMParser().parseFromString(body, "text/xml");

  // textContent (rather than innerHTML) yields decoded text: no XML entities
  // such as "&amp;" and no CDATA markers end up in titles or URLs. A missing
  // child element becomes an empty string instead of throwing, so one
  // malformed item no longer discards the whole feed.
  const readField = (item, tag) => item.querySelector(tag)?.textContent ?? "";

  return Array.from(xmlDoc.querySelectorAll("item"), (item) => ({
    title: cleanText(readField(item, "title")),
    url: readField(item, "link").trim(),
    snippet: cleanText(readField(item, "description")),
  })).filter((entry) => entry.url !== ""); // an item without a link cannot be opened
};

/**
 * Collects items from every feed in `rssFeeds` and ranks them by the
 * similarity between `keywords` and each item's title.
 *
 * Feeds that fail to download or parse are skipped (best effort); the
 * remaining feeds still contribute their items.
 *
 * @param {string} keywords - Query text to rank the news items against.
 * @returns {Promise<Array<{title: string, url: string, snippet: string, similarity: number}>>}
 *   Items sorted by descending similarity; empty when no feed produced items.
 */
const searchNews = async (keywords) => {
  const outcomes = await Promise.allSettled(
    Object.values(rssFeeds).map((rssUrl) => fetchRssItems(rssUrl))
  );
  const entries = outcomes.flatMap((outcome) =>
    outcome.status === "fulfilled" ? outcome.value : []
  );
  // Nothing to rank: skip the similarity computation entirely.
  if (entries.length === 0) {
    return [];
  }

  const similarities = await computeSimilarity(
    keywords,
    entries.map((entry) => entry.title)
  );
  return entries
    .map((entry, i) => ({ ...entry, similarity: similarities[i] }))
    .sort((a, b) => b.similarity - a.similarity);
};

/**
 * Searches the DuckDuckGo HTML endpoint (through the CORS proxy) and returns
 * the organic results found on the page.
 *
 * @param {string} keywords - Raw (not yet URL-encoded) search query.
 * @returns {Promise<Array<{title: string, snippet: string, url: string}>>}
 *   Results in page order; ads and results without a link are omitted.
 * @throws Propagates any error raised by the HTTP request.
 */
const searchWeb = async (keywords) => {
  // Encode the query so characters such as "&", "#" or "+" stay part of it.
  const query = encodeURIComponent(keywords);
  const res = await axios.get(
    `${corsProxy}https://html.duckduckgo.com/html?q=${query}`,
    { timeout: TIMEOUT }
  );
  const $ = cheerio.load(res.data);
  const articles = [];

  $("div.links_main").each((index, element) => {
    const result = $(element);
    const ddgUrl = result.find("a.result__url").attr("href");
    // A result block without a link cannot be used; skip it instead of
    // throwing and losing every other result.
    if (!ddgUrl) {
      return;
    }
    const title = cleanText(result.find("h2 a").text());
    const snippet = cleanText(result.find("a.result__snippet").text());

    if (!ddgUrl.includes("duckduckgo.com")) {
      // Direct URL
      articles.push({ title, snippet, url: ddgUrl });
      return;
    }

    // Redirect link: the target lives in the "uddg" query parameter.
    // URLSearchParams.get already percent-decodes the value, so decoding it a
    // second time would corrupt (or throw on) targets that contain "%".
    const queryString = ddgUrl.slice(ddgUrl.indexOf("?") + 1);
    const url = new URLSearchParams(queryString).get("uddg");
    // Targets that point back at duckduckgo.com are ads.
    if (url && !url.includes("duckduckgo.com")) {
      articles.push({ title, snippet, url });
    }
  });
  return articles;
};

/**
 * Downloads a web page through the CORS proxy and returns its main article
 * text as extracted by Readability.
 *
 * @param {string} url - Absolute URL of the page to read.
 * @returns {Promise<string>} Cleaned plain text of the article.
 * @throws {Error} When the request fails or times out, or when Readability
 *   cannot find any article content in the page.
 */
const readArticle = async (url) => {
  // Same timeout as the other network calls in this module, so a stalled
  // server cannot hang the caller indefinitely.
  const response = await fetch(corsProxy + url, {
    signal: AbortSignal.timeout(TIMEOUT),
  });
  const rawHtml = await response.text();

  // Unwrap anchor tags (keeping their text). The pattern only matches "<a>",
  // "<a ...>" and "</a>", so elements whose names merely start with "a"
  // (<article>, <aside>, <abbr>, ...) are left intact.
  const cleanHtml = DOMPurify.sanitize(rawHtml).replace(
    /<\/?a(?:\s[^>]*)?>/gi,
    ""
  );

  const dom = new DOMParser().parseFromString(cleanHtml, "text/html");
  const article = new Readability(dom).parse();
  // parse() yields null when no article could be extracted.
  if (article == null) {
    throw new Error(`Could not extract readable content from ${url}`);
  }
  return cleanText(article.textContent);
};

/**
 * Normalizes a fragment of text taken from HTML or RSS: removes CDATA markers
 * and markup tags, collapses whitespace, and trims the result.
 *
 * Markup is removed before whitespace is normalized so that spaces left
 * behind by stripped tags are collapsed and trimmed as well.
 *
 * @param {string|null|undefined} text - Text to clean; null/undefined yield "".
 * @returns {string} The cleaned text.
 */
const cleanText = (text) => {
  if (text == null) {
    return "";
  }
  return String(text)
    .replaceAll("<![CDATA[", "") // every CDATA opener, not just the first
    .replaceAll("]]>", "") // every CDATA terminator
    .replace(/<[^>]*>/g, "") // HTML elements
    .replace(/\s\s+/g, " ") // runs of two or more whitespace characters
    .replace(/(\r\n|\n|\r)+/g, "\n") // remaining single line breaks become "\n"
    .trim(); // leading and trailing whitespace
};

// Pending or resolved Universal Sentence Encoder model, shared by all calls
// so the model is loaded once rather than on every similarity computation.
let useModelPromise = null;

/**
 * Returns the shared encoder model, starting the load on first use.
 * A failed load clears the cache so that a later call can retry.
 */
const loadUseModel = () => {
  if (useModelPromise === null) {
    useModelPromise = use.load().catch((err) => {
      useModelPromise = null;
      throw err;
    });
  }
  return useModelPromise;
};

/**
 * Scores each entry against the query using the dot product of their
 * sentence embeddings.
 *
 * @param {string} query - Text to compare every entry against.
 * @param {string[]} entries - Texts to score.
 * @returns {Promise<number[]>} One score per entry, in the order of `entries`;
 *   an empty array when `entries` is empty.
 */
const computeSimilarity = async (query, entries) => {
  // Nothing to score: avoid loading the model and embedding the query alone.
  if (entries.length === 0) {
    return [];
  }
  const model = await loadUseModel();
  const emb = await model.embed([query].concat(entries));
  let embs;
  try {
    // Asynchronous read-back avoids blocking the thread like arraySync does.
    embs = await emb.array();
  } finally {
    // Tensors are not garbage collected; release the backing memory explicitly.
    emb.dispose();
  }
  const [queryEmb, ...entriesEmbs] = embs;
  return entriesEmbs.map((entryEmb) =>
    queryEmb.reduce((sum, value, i) => sum + value * entryEmb[i], 0)
  );
};

export { rssFeeds, searchNews, searchWeb, readArticle, computeSimilarity };
