const lunr = require("lunr");
const fs = require("fs");
const path = require("path");
const puppeteer = require("puppeteer");
const http = require("http");
const portfinder = require("portfinder");
const { routes } = require("../src/router/routes");
const {
  useAcronymAliases,
  configureCustomTokenizer,
} = require("../src/plugins/search/lunr-pipelines");

// Collect the routes that we want to include in the search index, keyed by
// route path. Redirects, the catch-all route and guarded routes are skipped.
const routesMap = routes.reduce((map, route) => {
  if (!route.redirect && route.path !== "*" && !route.beforeEnter) {
    // for limited route debugging, use example: `if (route.path === "/products/cnap")`
    map[route.path] = route;
  }
  return map;
}, {});

/**
 * Extracts the text content of the `main` element from a puppeteer page.
 * The `.page-header` element (if present) is hidden first so that its text is
 * not included in the returned `innerText`.
 * @param {puppeteer.Page} page the page object to examine
 * @returns {Promise<String>} page content
 */
const scrapeRouteContent = async (page) => {
  // hide the header to avoid duplication
  const header = await page.$(".page-header");
  if (header) {
    await page.evaluate((el) => {
      el.style["display"] = "none";
    }, header);
  }

  // this is just a simple "grab the text" approach
  // we can get more complicated later if necessary
  return page.$eval("main", (element) => element.innerText);
};

/**
 * Iterates through all the routes in `routesMap`, loads each one in a
 * headless browser and scrapes its content.
 * The browser is always closed, even when loading or scraping a page fails.
 * @param {String} baseUrl origin that serves the built site, without a trailing slash
 * @returns {Promise<Array<Object>>} array of route data ("documents" for lunr)
 */
const getRoutesData = async (baseUrl) => {
  const browser = await puppeteer.launch({
    // can set to `false` to open the browser window up for debugging
    headless: true,
  });

  try {
    const page = await browser.newPage();
    const allRouteData = [];

    // iterate over all the page routes (sequentially, reusing a single page)
    for (const [routePath, route] of Object.entries(routesMap)) {
      const url = `${baseUrl}${routePath}`;
      console.debug("processing page", url);

      await page.goto(url);
      await page.waitForNetworkIdle();
      const content = await scrapeRouteContent(page);

      // this is the "document" object that will be indexed by lunr
      allRouteData.push({
        id: routePath,
        // in the index we replace "/" so that the parts of the path are
        // indexed as separate tokens
        path: routePath.replaceAll("/", " "),
        title: route.meta?.header?.title || route.meta?.breadcrumb,
        description: route.meta?.header?.description,
        content,
      });
    }

    return allRouteData;
  } finally {
    // release the browser process even if a page failed to load/scrape
    await browser.close();
  }
};

/**
 * Creates and configures a lunrjs search index from input data
 * @param {Array<Object>} routeData documents with `id`, `path`, `title`,
 *   `description` and `content` fields
 * @returns {lunr.Index} lunrjs index
 */
const createLunrIndex = (routeData) => {
  configureCustomTokenizer();

  return lunr(function () {
    // index fields
    this.ref("id");
    this.field("path", { boost: 5 });
    this.field("title", { boost: 15 });
    this.field("description", { boost: 10 });
    this.field("content");
    this.metadataWhitelist = ["position"];

    // plugins
    this.use(useAcronymAliases);

    // add data to index
    for (const data of routeData) {
      this.add(data);
    }
  });
};

// iife entrypoint
(async () => {
  // find a free port
  const port = await portfinder.getPortPromise();
  const baseUrl = `http://localhost:${port}`;
  const staticDir = path.join(__dirname, "../dist");
  const indexHtmlPath = path.join(staticDir, "index.html");

  // start up temp http server on the open port
  // this temp server serves the built site so the scraper can scrape content
  // to populate the search index
  const server = http.createServer((req, res) => {
    // ignore any query string so that e.g. `/app.js?v=1` still resolves to the file
    const urlPath = req.url.split("?")[0];
    const requestedPath = path.join(staticDir, urlPath);
    // only serve regular files located inside the static directory
    const isServableFile =
      requestedPath.startsWith(staticDir + path.sep) &&
      fs.existsSync(requestedPath) &&
      fs.lstatSync(requestedPath).isFile();

    res.writeHead(200);
    // anything that is not a file falls back to index.html so that the
    // client-side router can handle the route
    res.end(fs.readFileSync(isServableFile ? requestedPath : indexHtmlPath));
  });

  // wait until the server is actually accepting connections; a bind failure
  // rejects here instead of surfacing as an unhandled 'error' event
  await new Promise((resolve, reject) => {
    server.once("error", reject);
    server.listen(port, resolve);
  });

  let routeData;
  try {
    // scrape text content on pages
    routeData = await getRoutesData(baseUrl);
  } finally {
    // clean up, even when scraping failed
    server.close();
  }

  // generate the lunr index from the scraped data
  const index = createLunrIndex(routeData);

  // write the serialized search index/data so that it can be read by the static site
  const outputPath = path.join(__dirname, "../src/assets/data");
  console.debug("writing search index to", outputPath);
  fs.writeFileSync(
    path.join(outputPath, "search-index.json"),
    JSON.stringify(index)
  );
  fs.writeFileSync(
    path.join(outputPath, "search-data.json"),
    JSON.stringify(routeData)
  );
})().catch((err) => {
  // make build failures visible and fail the process with a non-zero exit code
  console.error("failed to build search index", err);
  process.exitCode = 1;
});