Skip to content

Commit

Permalink
get popularities from docs-internal.popular-pages (github#35583)
Browse files Browse the repository at this point in the history
  • Loading branch information
peterbe authored Mar 15, 2023
1 parent 416c739 commit be30059
Show file tree
Hide file tree
Showing 10 changed files with 103 additions and 1,042 deletions.
1 change: 0 additions & 1 deletion .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
/.github/ @github/docs-engineering
/script/ @github/docs-engineering
/includes/ @github/docs-engineering
/lib/search/popular-pages.json @github/docs-engineering
Dockerfile @github/docs-engineering
package-lock.json @github/docs-engineering
package.json @github/docs-engineering
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/sync-search-elasticsearch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ jobs:
- name: Check out repo
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8

- name: Clone docs-internal.popular-pages
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
with:
repository: github/docs-internal.popular-pages
# This works because user `docubot` has read access to that private repo.
token: ${{ secrets.DOCUBOT_REPO_PAT }}
path: popular-pages

- name: Clone all translations
if: ${{ matrix.language != 'en' }}
uses: ./.github/actions/clone-translations
Expand Down Expand Up @@ -149,6 +157,10 @@ jobs:
# the same as not set within the script.
VERSION: ${{ github.event.inputs.version }}

# The sync-search-index recognizes this env var if you don't
# use the `--popular-pags <PATH>` option.
POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json

run: |
mkdir /tmp/records
npm run sync-search-indices -- /tmp/records \
Expand Down
13 changes: 12 additions & 1 deletion .github/workflows/sync-search-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ on:
paths:
- 'script/search/**'
- 'package*.json'
- lib/search/popular-pages.json
# Ultimately, for debugging this workflow itself
- .github/workflows/sync-search-pr.yml

Expand All @@ -36,6 +35,14 @@ jobs:
- name: Check out repo
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8

- name: Clone docs-internal.popular-pages
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
with:
repository: github/docs-internal.popular-pages
# This works because user `docubot` has read access to that private repo.
token: ${{ secrets.DOCUBOT_REPO_PAT }}
path: popular-pages

- uses: ./.github/actions/setup-elasticsearch

- uses: ./.github/actions/node-npm-setup
Expand Down Expand Up @@ -78,6 +85,10 @@ jobs:
# let's just accept an empty string instead.
THROW_ON_EMPTY: false

# The sync-search-index recognizes this env var if you don't
# use the `--popular-pags <PATH>` option.
POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json

run: |
mkdir /tmp/records
npm run sync-search-indices -- /tmp/records \
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/translation-health-report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ jobs:
- name: Checkout the docs-internal repo
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8

- name: Clone docs-internal.popular-pages
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
with:
repository: github/docs-internal.popular-pages
# This works because user `docubot` has read access to that private repo.
token: ${{ secrets.DOCUBOT_REPO_PAT }}
path: popular-pages

- name: Checkout the language-specific repo
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8
with:
Expand All @@ -74,6 +82,8 @@ jobs:
- uses: ./.github/actions/node-npm-setup

- name: Create translation health report
env:
POPULAR_PAGES_JSON: popular-pages/records/popular-pages.json
run: |
node script/i18n/create-translation-health-report.js \
--language ${{ matrix.language }} \
Expand Down
1,000 changes: 0 additions & 1,000 deletions lib/search/popular-pages.json

This file was deleted.

10 changes: 5 additions & 5 deletions script/i18n/create-translation-health-report.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,12 @@ if (!languageKeys.includes(language)) {

// Gather popularity data the search uses to prioritize errors
const scores = {}
const popularPagesRaw = await fs.readFile('lib/search/popular-pages.json', 'utf8')
for (const line of popularPagesRaw.split('\n')) {
try {
const row = JSON.parse(line)
const { POPULAR_PAGES_JSON } = process.env
if (POPULAR_PAGES_JSON) {
const popularPagesRaw = await fs.readFile(POPULAR_PAGES_JSON, 'utf8')
for (const row of JSON.parse(popularPagesRaw)) {
scores[row.path_article] = row.path_count
} catch {}
}
}

// Load all pages in language
Expand Down
6 changes: 4 additions & 2 deletions script/search/build-records.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ export default async function buildRecords(
redirects,
config = {}
) {
const { noMarkers } = config
const { noMarkers, popularPagesFilePath } = config
console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`)
const records = []
const pages = indexablePages
Expand All @@ -53,7 +53,9 @@ export default async function buildRecords(
return permalink
})

const popularPages = await getPopularPages(redirects)
const popularPages = popularPagesFilePath
? await getPopularPages(popularPagesFilePath, redirects)
: {}

console.log('indexable pages', indexablePages.length)
console.log('pages in index', pages.length)
Expand Down
73 changes: 41 additions & 32 deletions script/search/popular-pages.js
Original file line number Diff line number Diff line change
@@ -1,40 +1,49 @@
import fs from 'fs/promises'

const POPULAR_PAGES_JSON = './lib/search/popular-pages.json'
export default async function getPopularPages(filePath, redirects) {
const popularPagesRaw = await fs.readFile(filePath, 'utf-8')

export default async function getPopularPages(redirects) {
const popularPages = {}
try {
const popularPagesRaw = await fs.readFile(POPULAR_PAGES_JSON, 'utf-8')
let biggestCount = 0
for (const line of popularPagesRaw.split('\n')) {
if (!line.trim()) continue
const { path_article: path, path_count: count } = JSON.parse(line)
// The root page or any other potentially dirty record that is empty.
if (!path) continue
// This is safe because the `POPULAR_PAGES_JSON` always lists the
// most popular first.
if (!biggestCount) biggestCount = count
// Don't bother writing massively long floating point numbers
// because reducing it makes the JSON records smaller and we don't
// need any more precision than 7 significant figures.
const ratio = Number((count / biggestCount).toFixed(7))
// The reason we're heeding redirects is because it's very possible
// that the `POPULAR_PAGES_JSON` file is older/"staler" than the
// content itself.
// Imaging our analytics recorded that `/en/foo` had 1,234 pageviews,
// and someone goes and... `git mv content/foo content/bar` plus
// adding `redirect_from: - /foo` into the front-matter.
// Then, by using the redirects first, we can maintain that popularity
// by now "pretending" that it's `/en/bar` that has 1,234 pageviews.
popularPages[redirects[path] || path] = ratio
// Firt iterate through the array of objects, not making an assumption
// that the first one is the biggest one.
const all = {}
for (const { path_article: path, path_count: count } of JSON.parse(popularPagesRaw)) {
if (!path) {
// Can happen if the SQL query is, for some unknown reason, finding
// a path that is either `null` or an empty string. Treat it as a
// junk entry and skip it.
continue
}
if (path === 'index') {
// That's the home page which doesn't count. It doesn't count because
// people don't arrive on that for the information they seek. It's
// merely a navigation tool.
continue
}
} catch (error) {
if (error.code === 'ENOENT') {
console.warn(`The file ${POPULAR_PAGES_JSON} can not be found.`)
} else {
throw error
if (path.startsWith('early-access/')) {
// We never index these anyway so their popularity is never relevant.
continue
}
all[path] = count
}

const biggestCount = Math.max(...Object.values(all))
const popularPages = {}
for (const [path, count] of Object.entries(all)) {
// Don't bother writing massively long floating point numbers
// because reducing it makes the JSON records smaller and we don't
// need any more precision than 7 significant figures.
const ratio = Number((count / biggestCount).toFixed(7))

// The reason we're heeding redirects is because it's possible
// that the JSON file is older/"staler" than the
// content itself.
// Imaging our analytics recorded that `/en/foo` had 1,234 pageviews,
// and someone goes and... `git mv content/foo content/bar` plus
// adding `redirect_from: - /foo` into the front-matter.
// Then, by using the redirects first, we can maintain that popularity
// by now "pretending" that it's `/en/bar` that has 1,234 pageviews.
popularPages[redirects[path] || path] = ratio
}

return popularPages
}
19 changes: 19 additions & 0 deletions script/search/sync-search-indices.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
//
// [end-readme]

import { existsSync } from 'fs'

import assert from 'assert'
import { program, Option } from 'commander'

Expand Down Expand Up @@ -37,6 +39,7 @@ program
)
.option('--no-markers', 'Do not print a marker for each parsed document')
.option('--filter <MATCH>', 'Filter to only do pages that match this string')
.option('-p, --popular-pages <PATH>', 'Popular pages JSON file (defaults to $POPULAR_PAGES_JSON)')
.argument('<out-directory>', 'where the indexable files should be written')
.parse(process.argv)

Expand Down Expand Up @@ -85,6 +88,21 @@ async function main(opts, args) {
}
}

let popularPagesFilePath
const { popularPages } = opts
const { POPULAR_PAGES_JSON } = process.env
if (popularPages) {
if (!existsSync(popularPages)) {
throw new Error(`'${popularPages}' does not exist`)
}
popularPagesFilePath = popularPages
} else if (POPULAR_PAGES_JSON) {
if (!existsSync(POPULAR_PAGES_JSON)) {
throw new Error(`'${POPULAR_PAGES_JSON}' does not exist`)
}
popularPagesFilePath = POPULAR_PAGES_JSON
}

// A `--version` or `process.env.VERSION` was specified, we need to convert
// it to the long name. I.e. `free-pro-team@latest`. Not `dotcom`.
// But it could also have beeb specified as `all` which means that `version`
Expand All @@ -109,6 +127,7 @@ async function main(opts, args) {
const config = {
noMarkers: !opts.markers,
filter: opts.filter,
popularPagesFilePath,
}

const options = {
Expand Down
1 change: 0 additions & 1 deletion tests/meta/repository-references.js
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ const IGNORE_PATHS = [
'**/*.graphql', // E.g. data/graphql/ghec/schema.docs.graphql
'package-lock.json', // At the time of writing it's 1.5MB!
'.linkinator/full.log', // Only present if you've run linkinator
'lib/search/popular-pages.json', // used to build search indexes
'tests/**/*.json',
'src/**/*.json', // OpenAPI schema files
'content/early-access', // Not committed to public repository.
Expand Down

0 comments on commit be30059

Please sign in to comment.