// generate-search-index.js

const lunr = require("lunr");
const fs = require("fs");
const path = require("path");
const puppeteer = require("puppeteer");
const http = require("http");
const portfinder = require("portfinder");
const { routes } = require("../src/router/routes");
const { useAcronymAliases } = require("../src/plugins/search/lunr-pipelines");

// lookup of route path -> route for every route that goes into the search index
// (redirects, the "*" route and routes with a `beforeEnter` hook are left out)
const routesMap = Object.fromEntries(
  routes
    // for limited route debugging, use example: `.filter((route) => route.path === "/products/cnap")`
    .filter(
      (route) => !(route.redirect || route.path === "*" || route.beforeEnter)
    )
    .map((route) => [route.path, route])
);

/**
 * Reads the text of the `main` element of a rendered page.
 * A `.page-header` element, when present, is hidden beforehand to avoid
 * duplicating the header text.
 * @param {puppeteer.Page} page the page object to examine
 * @returns {Promise<String>} resolves to the `innerText` of `main`
 */
const scrapeRouteContent = async (page) => {
  // both callbacks are handed to puppeteer and must not close over local state
  const hideElement = (element) => {
    element.style["display"] = "none";
  };
  const readText = (element) => element.innerText;

  const headerHandle = await page.$(".page-header");
  if (headerHandle) {
    await page.evaluate(hideElement, headerHandle);
  }

  // plain "grab the text" extraction; can be made smarter later if necessary
  return page.$eval("main", readText);
};

/**
 * Visits every route in `routesMap` with a puppeteer browser and builds the
 * document that will be indexed for it.
 * @param {String} baseUrl prefix prepended to each route path to form the page URL
 * @returns {Promise<Object[]>} one `{ id, path, title, description, content }`
 *   object per route, in `routesMap` order
 */
const getRoutesData = async (baseUrl) => {
  const browser = await puppeteer.launch({
    // can set to `false` to open the browser window up for debugging
    headless: true,
  });

  // close the browser on every exit path, so a page that fails to load or
  // scrape does not leave the browser open
  try {
    const page = await browser.newPage();
    const allRouteData = [];

    // iterate over all the page routes, reusing the same page for each one
    for (const [routePath, route] of Object.entries(routesMap)) {
      const url = `${baseUrl}${routePath}`;
      console.debug("processing page", url);

      await page.goto(url);
      await page.waitForNetworkIdle();
      const content = await scrapeRouteContent(page);

      const header = route.meta?.header;

      // this is the "document" object that will be indexed by lunr
      allRouteData.push({
        id: routePath,
        // in the index we replace "/" so that the parts of the path are indexed as separate tokens
        path: routePath.replaceAll("/", " "),
        // fall back to the breadcrumb when the route has no header title
        title: header?.title || route.meta?.breadcrumb,
        description: header?.description,
        content,
      });
    }

    return allRouteData;
  } finally {
    await browser.close();
  }
};

/**
 * Builds a lunrjs search index over the scraped route documents
 * @param {Array[Object]} routeData documents to add to the index
 * @returns {lunr} lunrjs index
 */
const createLunrIndex = (routeData) => {
  // must be a regular function: it configures the index through `this`
  function configureIndex() {
    const builder = this;

    // index fields; higher boost values are set on title, description and path
    builder.ref("id");
    builder.field("path", { boost: 5 });
    builder.field("title", { boost: 50 });
    builder.field("description", { boost: 10 });
    builder.field("content");
    builder.metadataWhitelist = ["position"];

    // plugins
    builder.use(useAcronymAliases);

    // add data to index
    for (const document of routeData) {
      builder.add(document);
    }
  }

  return lunr(configureIndex);
};

// iife entrypoint: serves `../dist` from a temporary local http server,
// scrapes every route, then writes the lunr index and the raw route data
(async () => {
  // find a free port
  const port = await portfinder.getPortPromise();
  const baseUrl = `http://localhost:${port}`;
  const staticDir = path.join(__dirname, "../dist");

  // start up temp http server on the open port
  // this temp server serves the built site so the scraper can scrape content
  // to populate the search index
  const server = http
    .createServer((req, res) => {
      const requestedPath = staticDir + req.url;
      const isExistingFile =
        fs.existsSync(requestedPath) && fs.lstatSync(requestedPath).isFile();

      // any request that does not match a file on disk is answered with index.html
      const fileToServe = isExistingFile
        ? requestedPath
        : `${staticDir}/index.html`;

      res.writeHead(200);
      res.end(fs.readFileSync(fileToServe));
    })
    .listen(port);

  // scrape text content on pages; the server is shut down whether or not
  // scraping succeeds so a failure does not leave it listening
  let routeData;
  try {
    routeData = await getRoutesData(baseUrl);
  } finally {
    server.close();
  }

  // generate the lunr index from the scraped data
  const index = createLunrIndex(routeData);

  // write the serialized search index/data so that it can be read by the static site
  const outputPath = path.join(__dirname, "../src/assets/data");
  console.debug("writing search index to", outputPath);
  // writeFileSync is synchronous, so there is nothing to await here
  fs.writeFileSync(
    path.join(outputPath, "search-index.json"),
    JSON.stringify(index)
  );
  fs.writeFileSync(
    path.join(outputPath, "search-data.json"),
    JSON.stringify(routeData)
  );
})().catch((error) => {
  // report the failure and make the process exit with a non-zero status
  console.error(error);
  process.exitCode = 1;
});