Skip to content

Commit

Permalink
autoversion: Support CSS selectors (cashapp#394)
Browse files Browse the repository at this point in the history
autoversion currently only supports XPath for searching for versions
in an HTML page.
This works great, but CSS selectors are arguably
more approachable for newcomers, and more widely understood.

This change adds support for CSS selectors to autoversion
using the github.com/andybalholm/cascadia package (BSD2 licensed)
for matching them.

To fit that in, this extracts the search logic into a new abstraction
called htmlMatcher with an implementation for XPath and one for CSS.

Following this change, you can do something like the following:

```hcl
auto-version {
  html {
    url = "https://example.com"
    css = "div.version"
  }
}
```

It is invalid to specify both `xpath` and `css`
in the same auto-version block.
  • Loading branch information
abhinav authored Feb 6, 2024
1 parent 23f33bc commit a5b7ea4
Show file tree
Hide file tree
Showing 10 changed files with 2,532 additions and 30 deletions.
3 changes: 2 additions & 1 deletion docs/docs/packaging/schema/html.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ Used by: [auto-version](../auto-version#blocks)

| Attribute | Type | Description |
|-----------|------|-------------|
| `css` | `string?` | CSS selector for selecting versions from HTML (see https://github.com/andybalholm/cascadia). Only one of xpath or css can be specified. |
| `url` | `string` | URL to retrieve HTML from. |
| `xpath` | `string` | XPath for selecting versions from HTML (see https://github.com/antchfx/htmlquery) - use version-pattern to extract substrings |
| `xpath` | `string?` | XPath for selecting versions from HTML (see https://github.com/antchfx/htmlquery) - use version-pattern to extract substrings |
9 changes: 5 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/alecthomas/kong v0.5.0
github.com/alecthomas/participle/v2 v2.0.0-beta.5
github.com/alecthomas/repr v0.1.0
github.com/andybalholm/cascadia v1.3.2
github.com/antchfx/htmlquery v1.2.4
github.com/antchfx/xpath v1.2.0
github.com/avvmoto/buf-readerat v0.0.0-20171115124131-a17c8cb89270
Expand All @@ -30,9 +31,9 @@ require (
github.com/willabides/kongplete v0.3.0
github.com/willdonnelly/passwd v0.0.0-20141013001024-7935dab3074c
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8
golang.org/x/net v0.0.0-20221004154528-8021a29435af
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211
golang.org/x/net v0.9.0
golang.org/x/sys v0.7.0
golang.org/x/term v0.7.0
howett.net/plist v1.0.0
mvdan.cc/sh v2.6.4+incompatible
)
Expand All @@ -51,6 +52,6 @@ require (
github.com/saracen/solidblock v0.0.0-20190426153529-45df20abab6f // indirect
github.com/ulikunitz/xz v0.5.10 // indirect
golang.org/x/crypto v0.0.0-20220214200702-86341886e292 // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/text v0.9.0 // indirect
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f // indirect
)
38 changes: 31 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ github.com/alecthomas/participle/v2 v2.0.0-beta.5/go.mod h1:RC764t6n4L8D8ITAJv0q
github.com/alecthomas/repr v0.0.0-20210801044451-80ca428c5142/go.mod h1:2kn6fqh/zIyPLmm3ugklbEi5hg5wS435eygvNfaDQL8=
github.com/alecthomas/repr v0.1.0 h1:ENn2e1+J3k09gyj2shc0dHr/yjaWSHRlrJ4DPMevDqE=
github.com/alecthomas/repr v0.1.0/go.mod h1:2kn6fqh/zIyPLmm3ugklbEi5hg5wS435eygvNfaDQL8=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/antchfx/htmlquery v1.2.4 h1:qLteofCMe/KGovBI6SQgmou2QNyedFUW+pE+BpeZ494=
github.com/antchfx/htmlquery v1.2.4/go.mod h1:2xO6iu3EVWs7R2JYqBbp8YzG50gj/ofqs5/0VZoDZLc=
github.com/antchfx/xpath v1.2.0 h1:mbwv7co+x0RwgeGAOHdrKy89GvHaGvxxBtPK0uF9Zr8=
Expand Down Expand Up @@ -103,40 +105,62 @@ github.com/willdonnelly/passwd v0.0.0-20141013001024-7935dab3074c h1:4+NVyrLUuEm
github.com/willdonnelly/passwd v0.0.0-20141013001024-7935dab3074c/go.mod h1:xcvfY9pOw6s4wyrhilFSbMthL6KzgrfCIETHHUOQ/fQ=
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo=
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.uber.org/goleak v1.1.10 h1:z+mqJhf6ss6BSfSM671tgKyZBFPTTJM+HLxnhPC3wu0=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.0.0-20220214200702-86341886e292 h1:f+lwQ+GtmgoY+A2YaQxlSOnDjXcQ7ZRLWOHbC6HtRqE=
golang.org/x/crypto v0.0.0-20220214200702-86341886e292/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
golang.org/x/lint v0.0.0-20190930215403-16217165b5de h1:5hukYrvBGR8/eNkX5mdUezrA6JiaEZDtJb9Ei+1LlBs=
golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0 h1:LUYupSeNrTNCGzR/hVBk2NHZO4hXcVaW1k4Qx7rjPx8=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220624214902-1bab6f366d9e/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.0.0-20221004154528-8021a29435af h1:wv66FM3rLZGPdxpYL+ApnDe2HzHcTFta3z5nsc13wI4=
golang.org/x/net v0.0.0-20221004154528-8021a29435af/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0 h1:aWJ/m6xSmxWBx+V0XRHTlrYrPG56jKsLdTFmsSsCzOM=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10 h1:WIoqL4EROvwiPdUtaip4VcDdpZ4kha7wBWZrbVKCIZg=
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0 h1:BEvjmm5fURWqcfbSKTdpkDXYBrUS1c0m8agp14W48vQ=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20191108193012-7d206e10da11 h1:Yq9t9jnGoR+dBuitxdo9l6Q7xh/zOyNnYUtDKaQ3x0E=
golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
Expand Down
117 changes: 100 additions & 17 deletions manifest/autoversion/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"sort"
"strings"

"github.com/andybalholm/cascadia"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xpath"
"golang.org/x/net/html"
Expand All @@ -30,14 +31,76 @@ func htmlAutoVersion(client *http.Client, autoVersion *manifest.AutoVersionBlock
if err != nil {
return "", errors.Wrapf(err, "%s: could not parse HTML", url)
}
expr, err := xpath.Compile(autoVersion.HTML.XPath)

var matcher htmlMatcher
switch {
case autoVersion.HTML.XPath != "":
matcher, err = compileXPathMatcher(autoVersion.HTML.XPath)
if err != nil {
return "", err
}

case autoVersion.HTML.CSS != "":
matcher, err = compileCSSMatcher(autoVersion.HTML.CSS)
if err != nil {
return "", err
}

default:
return "", errors.Errorf("must specify either xpath or css for auto-version html")
}

candidates, err := matcher.FindAll(node)
if err != nil {
return "", err
}

// Parse and sort versions so we can get the latest.
versions := make(manifest.Versions, 0, len(candidates))
for _, value := range candidates {
value = strings.TrimSpace(value)
groups := versionRe.FindStringSubmatch(value)
if groups == nil {
return "", errors.Errorf("version must match the pattern %s but is %s", autoVersion.VersionPattern, value)
}
versions = append(versions, manifest.ParseVersion(groups[1]))
}
sort.Sort(versions)

if len(versions) == 0 {
return "", errors.Errorf("no versions matched on %s", url)
}

return versions[len(versions)-1].String(), nil
}

// htmlMatcher searches for strings inside an HTML document.
//
// It is implemented by htmlXPathMatcher (XPath expressions) and
// htmlCSSMatcher (CSS selectors). htmlAutoVersion treats every returned
// string as a version candidate.
type htmlMatcher interface {
// FindAll returns the strings selected from the document tree rooted at n.
// An error is returned when the selection yields a kind of node the
// implementation cannot turn into a string.
FindAll(n *html.Node) ([]string, error)
}

// htmlXPathMatcher traverses HTML documents using a given XPath expression,
// and returns the text content of the selected nodes.
//
// The xpath expression must match an attribute, text, or element node,
// or produce a string value.
//
// Build values with compileXPathMatcher so that expr is populated.
type htmlXPathMatcher struct {
// raw is the XPath expression as written; it is quoted in error messages.
raw string
// expr is the compiled form of raw that FindAll evaluates.
expr *xpath.Expr
}

// compileXPathMatcher compiles an XPath expression into a matcher.
func compileXPathMatcher(raw string) (*htmlXPathMatcher, error) {
expr, err := xpath.Compile(raw)
if err != nil {
return "", errors.Wrapf(err, "could not compile XPath expression %q", autoVersion.HTML.XPath)
return nil, errors.Wrapf(err, "could not compile XPath expression %q", raw)
}
return &htmlXPathMatcher{raw: raw, expr: expr}, nil
}

// Collect potential candidates here.
func (m *htmlXPathMatcher) FindAll(node *html.Node) ([]string, error) {
var candidates []string
switch matches := expr.Evaluate(htmlquery.CreateXPathNavigator(node)).(type) {
switch matches := m.expr.Evaluate(htmlquery.CreateXPathNavigator(node)).(type) {
case *xpath.NodeIterator:
for matches.MoveNext() {
match := matches.Current()
Expand All @@ -46,28 +109,48 @@ func htmlAutoVersion(client *http.Client, autoVersion *manifest.AutoVersionBlock
candidates = append(candidates, match.Value())

default:
return "", errors.Errorf("XPath query %q did not select a text or attribute node, selected node of type %d", autoVersion.HTML.XPath, match.NodeType())
return nil, errors.Errorf("XPath query %q did not select a text or attribute node, selected node of type %d", m.raw, match.NodeType())
}
}

case string:
candidates = append(candidates, matches)

default:
return "", errors.Errorf("XPath query %q did not select a text value, selected node of type %T", autoVersion.HTML.XPath, matches)
return nil, errors.Errorf("XPath query %q did not select a text value, selected node of type %T", m.raw, matches)
}

// Parse and sort versions so we can get the latest.
versions := make(manifest.Versions, 0, len(candidates))
for _, value := range candidates {
value = strings.TrimSpace(value)
groups := versionRe.FindStringSubmatch(value)
if groups == nil {
return "", errors.Errorf("version must match the pattern %s but is %s", autoVersion.VersionPattern, value)
}
versions = append(versions, manifest.ParseVersion(groups[1]))
return candidates, nil
}

// htmlCSSMatcher traverses HTML documents using a given CSS selector,
// and returns the text content of the selected nodes.
//
// The CSS selector must match a text or element node.
//
// Note that FindAll takes the text from the first child of each selected
// element, so the selector should target the element that directly
// wraps the version text.
//
// Build values with compileCSSMatcher so that sel is populated.
type htmlCSSMatcher struct {
// raw is the CSS selector as written; it is quoted in error messages.
raw string
// sel is the compiled form of raw that FindAll matches against the document.
sel cascadia.Selector
}

// compileCSSMatcher parses raw as a CSS selector and returns a matcher that
// applies it to HTML documents.
//
// The original selector text is kept alongside the compiled selector so that
// later error messages can quote it. A selector that fails to compile is
// reported as an error that includes the offending text.
func compileCSSMatcher(raw string) (*htmlCSSMatcher, error) {
	compiled, compileErr := cascadia.Compile(raw)
	if compileErr != nil {
		return nil, errors.Wrapf(compileErr, "could not compile CSS selector %q", raw)
	}

	matcher := &htmlCSSMatcher{
		raw: raw,
		sel: compiled,
	}
	return matcher, nil
}

sort.Sort(versions)
return versions[len(versions)-1].String(), nil
// FindAll returns one candidate string for every node under node that the
// CSS selector matches, in the order the selector reports them.
//
// For an element, the candidate is the whitespace-trimmed Data of its first
// child, i.e. the leading text when the element starts with text. An element
// without children (for example <div></div>) yields an empty candidate rather
// than dereferencing a nil FirstChild; the caller's version pattern decides
// whether an empty value is acceptable. Should the selector ever yield a text
// node, its own trimmed Data is used, since text nodes have no children.
//
// An error is returned if the selector matches any other kind of node.
func (m *htmlCSSMatcher) FindAll(node *html.Node) ([]string, error) {
	var candidates []string
	for _, match := range m.sel.MatchAll(node) {
		switch match.Type {
		case html.TextNode:
			// A text node carries its content in its own Data field.
			candidates = append(candidates, strings.TrimSpace(match.Data))

		case html.ElementNode:
			// Guard against empty elements: FirstChild is nil when the
			// element has no content at all.
			var text string
			if child := match.FirstChild; child != nil {
				text = child.Data
			}
			candidates = append(candidates, strings.TrimSpace(text))

		default:
			return nil, errors.Errorf("CSS selector %q did not select a text or element node, selected node of type %d", m.raw, match.Type)
		}
	}
	return candidates, nil
}
45 changes: 45 additions & 0 deletions manifest/autoversion/html_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package autoversion

import (
"net/http"
"testing"

"github.com/alecthomas/assert/v2"
"github.com/cashapp/hermit/manifest"
)

// TestHTMLNoVersionsFound verifies that htmlAutoVersion reports a
// "no versions matched" error, for both the XPath and the CSS matcher,
// when the selector finds nothing usable in the page.
//
// The HTTP client uses the testHTTPClient transport pointed at
// testdata/no_versions.html (presumably serving that file for every
// request; see the helper's definition).
func TestHTMLNoVersionsFound(t *testing.T) {
	type testCase struct {
		name      string
		htmlBlock *manifest.HTMLAutoVersionBlock
	}

	cases := []testCase{
		{
			name: "XPath",
			htmlBlock: &manifest.HTMLAutoVersionBlock{
				URL:   "http://example.com",
				XPath: "/html/body/div",
			},
		},
		{
			name: "CSS",
			htmlBlock: &manifest.HTMLAutoVersionBlock{
				URL: "http://example.com",
				CSS: "body > div",
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			client := &http.Client{
				Transport: testHTTPClient{
					path: "testdata/no_versions.html",
				},
			}
			block := &manifest.AutoVersionBlock{
				HTML: tc.htmlBlock,
			}

			_, err := htmlAutoVersion(client, block)
			assert.Error(t, err)
			assert.Contains(t, err.Error(), "no versions matched")
		})
	}
}
19 changes: 19 additions & 0 deletions manifest/autoversion/testdata/css.expected.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
description = "Zig is a general-purpose programming language and toolchain for maintaining robust, optimal and reusable software"
strip = 1

linux {
source = "https://ziglang.org/download/${version}/zig-linux-${xarch}-${version}.tar.xz"
}

darwin {
source = "https://ziglang.org/download/${version}/zig-macos-${xarch}-${version}.tar.xz"
}

version "0.10.0" "0.11.0" {
auto-version {
html {
url = "https://ziglang.org/download/"
css = "h2[id^=release-0]"
}
}
}
Loading

0 comments on commit a5b7ea4

Please sign in to comment.