const lunr = require("lunr");
const fs = require("fs");
const path = require("path");
const puppeteer = require("puppeteer");
const http = require("http");
const portfinder = require("portfinder");
const { routes } = require("../src/router/routes");
const { useAcronymAliases } = require("../src/plugins/search/lunr-pipelines");
// collect routes that we want to include in the search index
const routesMap = routes.reduce((map, route) => {
  if (!route.redirect && route.path !== "*" && !route.beforeEnter) {
    // to debug a limited set of routes, use e.g. `if (route.path === "/products/cnap")`
    map[route.path] = route;
  }
  return map;
}, {});
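/*
 * For context, each record in ../src/router/routes is assumed to look roughly
 * like the sketch below (illustrative only; the real file defines the actual
 * shape). The scraper only relies on `path`, `redirect`, `beforeEnter`, and
 * the `meta.header` / `meta.breadcrumb` fields read further down.
 *
 *   {
 *     path: "/products/cnap",
 *     component: () => import("../views/SomeProductPage.vue"), // hypothetical component
 *     meta: {
 *       breadcrumb: "Example breadcrumb",
 *       header: {
 *         title: "Example page title",
 *         description: "Example page description",
 *       },
 *     },
 *   }
 */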
/**
 * Extracts text content from a puppeteer.Page
 * @param {puppeteer.Page} page the page object to examine
 * @returns {String} page content
 */
const scrapeRouteContent = async (page) => {
  // hide the header to avoid duplicating its text in every page's content
  const header = await page.$(".page-header");
  if (header) {
    await page.evaluate((header) => {
      header.style["display"] = "none";
    }, header);
  }
  // this is just a simple "grab the text" approach
  // we can get more complicated later if necessary
  return page.$eval("main", (element) => element.innerText);
};
/**
 * Iterates through all the routes to scrape data
 * @param {String} baseUrl
 * @returns {Array<Object>} array of route data
 */
const getRoutesData = async (baseUrl) => {
  const browser = await puppeteer.launch({
    // can set to `false` to open the browser window up for debugging
    headless: true,
  });
  const page = await browser.newPage();
  const allRouteData = [];
  // iterate over all the page routes
  for (const routePath in routesMap) {
    const url = `${baseUrl}${routePath}`;
    console.debug("processing page", url);
    await page.goto(url);
    await page.waitForNetworkIdle();
    const content = await scrapeRouteContent(page);
    // this is the "document" object that will be indexed by lunr
    const routeData = {
      id: routePath,
      // in the index we replace "/" so that the parts of the path are indexed as separate tokens
      path: routePath.replaceAll("/", " "),
      title:
        routesMap[routePath].meta?.header?.title ||
        routesMap[routePath].meta?.breadcrumb,
      description: routesMap[routePath].meta?.header?.description,
      content,
    };
    allRouteData.push(routeData);
  }
  await browser.close();
  return allRouteData;
};
/**
 * Creates and configures a lunrjs search index from input data
 * @param {Array<Object>} routeData
 * @returns {lunr.Index} lunrjs index
 */
const createLunrIndex = (routeData) => {
  return lunr(function () {
    // index fields
    this.ref("id");
    this.field("path", { boost: 5 });
    this.field("title", { boost: 10 });
this.field("description", { boost: 10 });
this.field("content");
this.metadataWhitelist = ["position"];
// plugins
this.use(useAcronymAliases);
// add data to index
for (const data of routeData) {
this.add(data);
}
});
};
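/*
 * For context, a minimal sketch of what an acronym-alias pipeline plugin such
 * as `useAcronymAliases` might look like. This is not the implementation that
 * lives in ../src/plugins/search/lunr-pipelines; the alias map and function
 * names below are hypothetical.
 *
 *   const ACRONYMS = { k8s: "kubernetes" }; // hypothetical alias map
 *   const expandAcronyms = (token) => {
 *     const alias = ACRONYMS[token.toString().toLowerCase()];
 *     // keep the original token and add its expansion so either form matches
 *     return alias ? [token, token.clone(() => alias)] : token;
 *   };
 *   lunr.Pipeline.registerFunction(expandAcronyms, "expandAcronyms");
 *   const useAcronymAliasesSketch = (builder) => {
 *     builder.pipeline.before(lunr.stemmer, expandAcronyms);
 *     builder.searchPipeline.before(lunr.stemmer, expandAcronyms);
 *   };
 */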
// iife entrypoint
(async () => {
  // find a free port
  const port = await portfinder.getPortPromise();
  const baseUrl = `http://localhost:${port}`;
  const staticDir = path.join(__dirname, "../dist");
  // start up temp http server on the open port
  // this temp server serves the built site so the scraper can scrape content
  // to populate the search index
  const server = http
    .createServer((req, res) => {
      const requestedPath = staticDir + req.url;
      if (
        fs.existsSync(requestedPath) &&
        fs.lstatSync(requestedPath).isFile()
      ) {
        res.writeHead(200);
        res.end(fs.readFileSync(requestedPath));
      } else {
        res.writeHead(200);
        res.end(fs.readFileSync(`${staticDir}/index.html`));
      }
    })
    .listen(port);
  // scrape text content on pages
  const routeData = await getRoutesData(baseUrl);
  // clean up
  server.close();
  // generate the lunr index from the scraped data
  const index = createLunrIndex(routeData);
  // write the serialized search index/data so that it can be read by the static site
  const outputPath = path.join(__dirname, "../src/assets/data");
  console.debug("writing search index to", outputPath);
  fs.writeFileSync(
    `${outputPath}/search-index.json`,
    JSON.stringify(index)
  );
  fs.writeFileSync(
    `${outputPath}/search-data.json`,
    JSON.stringify(routeData)
  );
})();
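/*
 * For reference, the static site can consume the two generated files roughly
 * like this (a sketch; the import paths, query string, and result shaping are
 * illustrative, not the site's actual search component):
 *
 *   const lunr = require("lunr");
 *   const serializedIndex = require("../src/assets/data/search-index.json");
 *   const searchData = require("../src/assets/data/search-data.json");
 *   const index = lunr.Index.load(serializedIndex);
 *   const results = index.search("kubernetes").map((result) => ({
 *     ...searchData.find((doc) => doc.id === result.ref),
 *     // term positions are available because of `metadataWhitelist` above
 *     matches: result.matchData.metadata,
 *   }));
 */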