Skip to content

Commit

Permalink
Run V1 detector, bulk court app type
Browse files Browse the repository at this point in the history
  • Loading branch information
pkierski committed Nov 27, 2024
1 parent 1d4fff6 commit 3cabace
Show file tree
Hide file tree
Showing 14 changed files with 335 additions and 176 deletions.
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@ go 1.23.0

require (
github.com/PuerkitoBio/goquery v1.10.0
github.com/hashicorp/go-retryablehttp v0.7.7
github.com/stretchr/testify v1.9.0
golang.org/x/net v0.31.0
golang.org/x/sync v0.1.0
golang.org/x/sync v0.9.0
)

require (
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/hashicorp/go-cleanhttp v0.5.2 // indirect
github.com/hashicorp/go-retryablehttp v0.7.7 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
13 changes: 12 additions & 1 deletion go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,18 @@ github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsVi
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ=
github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48=
github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k=
github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M=
github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU=
github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
Expand All @@ -26,15 +34,18 @@ golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=
golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=
golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
Expand Down
46 changes: 43 additions & 3 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,51 @@ import (
"fmt"
"os"
"slices"
"strings"

"github.com/hashicorp/go-retryablehttp"
"github.com/pkierski/wokanda-scrapper/pkg/data"
"github.com/pkierski/wokanda-scrapper/pkg/trialdownloader"
)

func main() {
client := retryablehttp.NewClient()
// bulktest.BulkV1Test(context.Background(), client.StandardClient())

courtsData := trialdownloader.DetectBulk(context.Background(), client.StandardClient(), data.Domains)
f1, err := os.Create("courts.json")
if err != nil {
panic(err)
}
defer f1.Close()
encoder := json.NewEncoder(f1)
encoder.SetIndent("", " ")
encoder.Encode(courtsData)

return

// domains, v1Results := bulktest.BulkV1Test(context.Background(), client.StandardClient())

// f1, err := os.Create("v1_with_false.csv")
// if err != nil {
// panic(err)
// }
// defer f1.Close()
// for i, d := range domains {
// fmt.Fprintf(f1, "%v,%v\n", d, v1Results[i])
// }

// f2, err := os.Create("v1_with_empty.csv")
// if err != nil {
// panic(err)
// }
// defer f2.Close()
// for i, d := range domains {
// if !v1Results[i] {
// d = ""
// }
// fmt.Fprintln(f2, d)
// // fmt.Fprintf(f2, "%v: %v\n", d, v1Results[i])
// }
// return

if len(os.Args) < 2 {
Expand All @@ -22,14 +59,17 @@ func main() {
}

// TODO: use constructor based on url
downloader := trialdownloader.NewV2Wokanda(client.StandardClient(), os.Args[1])
downloader := trialdownloader.NewV1Wokanda(client.StandardClient(), os.Args[1])

trials, err := downloader.Download(context.Background(), "2006-01-02")
trials, err := downloader.Download(context.Background(), "2024-11-27")
if err != nil {
panic(err)
}

slices.SortFunc(trials, func(a, b trialdownloader.Trial) int {
if c := strings.Compare(a.CaseID, b.CaseID); c != 0 {
return c
}
return a.Date.Compare(b.Date)
})
j, _ := json.MarshalIndent(trials, "", " ")
Expand Down
39 changes: 18 additions & 21 deletions pkg/bulktest/bulkV1.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ import (
"os"
"path/filepath"
"slices"
"sync"
"time"

"github.com/pkierski/wokanda-scrapper/pkg/data"
"github.com/pkierski/wokanda-scrapper/pkg/trialdownloader"
"golang.org/x/sync/errgroup"
)
Expand All @@ -22,41 +24,36 @@ type resultType struct {
DateAquired time.Time `json:"date_aquired"`
}

func BulkV1Test(ctx context.Context, client *http.Client) {
func BulkV1Test(ctx context.Context, client *http.Client) ([]string, []bool) {
eg, taskCtx := errgroup.WithContext(ctx)
eg.SetLimit(16)
eg.SetLimit(128)

results := make(map[string]resultType)
resultsCh := make(chan resultType)

go func() {
for result := range resultsCh {
results[result.Url] = result
writeResultV1(result)
}
}()

// domains := slices.Clone(data.Domains)
domains := slices.Clone(data.Domains)
// TODO: remove already checked
domains := []string{"legnica.so.gov.pl"}
// domains := []string{"legnica.so.gov.pl"}
resultsV1 := make([]bool, len(domains))
var resultsV1Mu sync.Mutex

for _, url := range domains {
eg.Go(func() error {
url = "https://" + url
log.Printf("starting %v", url)
defer log.Printf("finished %v", url)

var result resultType
result.Url = url
downloader := trialdownloader.NewV1Wokanda(client, fmt.Sprintf("https://%v", url))
result.Trials, result.Err = downloader.Download(taskCtx, "2006-02-01")
result.DateAquired = time.Now().UTC()
resultsCh <- result
r := trialdownloader.Detect(taskCtx, client, url)
if len(r) == 1 && r[0] == trialdownloader.AppTypeV1 {
i := slices.Index(domains, url)
resultsV1Mu.Lock()
resultsV1[i] = true
resultsV1Mu.Unlock()
}
return nil
})
}

eg.Wait()
close(resultsCh)

return domains, resultsV1
}

func writeResultV1(result resultType) {
Expand Down
10 changes: 5 additions & 5 deletions pkg/data/domains.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ suwalki.sr.gov.pl
gdansk.sa.gov.pl
bydgoszcz.so.gov.pl
bydgoszcz.sr.gov.pl
inowroclaw.sr.gov.pl
bip.inowroclaw.sr.gov.pl
mogilno.sr.gov.pl
naklo.sr.gov.pl
szubin.sr.gov.pl
Expand All @@ -51,7 +51,7 @@ gdansk.so.gov.pl
gdansk-poludnie.sr.gov.pl
gdansk-polnoc.sr.gov.pl
gdynia.sr.gov.pl
kartuzy.sr.gov.pl
www.kartuzy.sr.gov.pl
koscierzyna.sr.gov.pl
kwidzyn.sr.gov.pl
malbork.sr.gov.pl
Expand Down Expand Up @@ -318,7 +318,7 @@ strzelce-kraj.sr.gov.pl
sulecin.sr.gov.pl
miedzyrzecz.sr.gov.pl
waw.sa.gov.pl
warszawa.so.gov.pl
bip.warszawa.so.gov.pl
warszawa.sr.gov.pl
warszawa-srodmiescie.sr.gov.pl
warszawa-zoliborz.sr.gov.pl
Expand All @@ -333,7 +333,7 @@ nowydwormaz.sr.gov.pl
otwock.sr.gov.pl
warszawa-pragapoludnie.sr.gov.pl
warszawapraga-pln.sr.gov.pl
wolomin.sr.gov.pl
bip.wolomin.sr.gov.pl
wroclaw.sa.gov.pl
jelenia-gora.so.gov.pl
jelenia-gora.sr.gov.pl
Expand All @@ -345,7 +345,7 @@ boleslawiec.sr.gov.pl
legnica.so.gov.pl
legnica.sr.gov.pl
glogow.sr.gov.pl
jawor.gov.pl
jawor.sr.gov.pl
lubin.sr.gov.pl
zlotoryja.sr.gov.pl
opole.so.gov.pl
Expand Down
10 changes: 0 additions & 10 deletions pkg/detect/type.go

This file was deleted.

12 changes: 0 additions & 12 deletions pkg/detect/v1.go

This file was deleted.

7 changes: 7 additions & 0 deletions pkg/trialdownloader/construct.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package trialdownloader

import "net/http"

// NewDownloader is currently a placeholder: it ignores client and baseUrl and
// always returns nil.
// NOTE(review): presumably intended to pick a Downloader implementation for
// baseUrl — confirm the intended behavior before relying on the result.
func NewDownloader(client *http.Client, baseUrl string) Downloader {
return nil
}
77 changes: 77 additions & 0 deletions pkg/trialdownloader/detector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package trialdownloader

import (
"bytes"
"context"
"fmt"
"net/http"
"sync"

"golang.org/x/sync/errgroup"
)

// CourtData pairs a court domain with the application types detected for it.
// It is the element type of the slice returned by DetectBulk.
type CourtData struct {
// Domain is the domain string that was passed to DetectBulk.
Domain string `json:"domain"`
// AppTypes holds the application types Detect reported for Domain.
AppTypes []AppType `json:"app_types"`
}

// Application type identifiers reported by the detectors in this file.
const (
// AppTypeV1 is the type reported by detectV1.
// NOTE(review): the value appears to encode a version and URL layout
// ("url/wokanda") — confirm the exact meaning of the format.
AppTypeV1 AppType = "V1:url/wokanda,I"
)

// Detect runs every known detector against baseUrl concurrently and returns
// the application types whose detector reported a match.
//
// baseUrl is handed to each detector unchanged. The returned slice is nil
// when no detector matches. Matches are appended as the detector goroutines
// finish, so the order of the result is not fixed once more than one
// detector is registered.
func Detect(ctx context.Context, client *http.Client, baseUrl string) (result []AppType) {
	type detectFunc func(ctx context.Context, client *http.Client, baseUrl string) (bool, AppType)

	probes := []detectFunc{detectV1}

	// mu guards result, which is appended to from several goroutines.
	var mu sync.Mutex

	group, groupCtx := errgroup.WithContext(ctx)
	for _, probe := range probes {
		group.Go(func() error {
			matched, appType := probe(groupCtx, client, baseUrl)
			if !matched {
				return nil
			}

			mu.Lock()
			defer mu.Unlock()
			result = append(result, appType)
			return nil
		})
	}

	// Every task returns nil, so Wait acts purely as a barrier here.
	group.Wait()

	return result
}

// DetectBulk runs Detect for each entry of domains, one goroutine per domain,
// and returns one CourtData per input domain in the same order as domains.
//
// AppTypes in every returned element is a non-nil slice, empty when Detect
// found nothing (encoding/json renders a nil slice as null and an empty one
// as []).
func DetectBulk(ctx context.Context, client *http.Client, domains []string) []CourtData {
	group, groupCtx := errgroup.WithContext(ctx)
	out := make([]CourtData, len(domains))

	for idx, name := range domains {
		group.Go(func() error {
			// Copy the detected types into a fresh, non-nil slice.
			types := make([]AppType, 0)
			types = append(types, Detect(groupCtx, client, name)...)

			// Each goroutine writes only its own slot, so no lock is needed.
			out[idx] = CourtData{
				Domain:   name,
				AppTypes: types,
			}
			return nil
		})
	}

	// Tasks never fail; Wait just blocks until all slots are filled.
	group.Wait()

	return out
}

// detectV1 fetches https://<baseUrl>/wokanda and reports whether the page
// contains all of the HTML fragments that identify the V1 application.
//
// typ is always AppTypeV1, regardless of the outcome. found is false when the
// page cannot be fetched or when any of the fragments is missing.
func detectV1(ctx context.Context, client *http.Client, baseUrl string) (found bool, typ AppType) {
	body, err := getOne(ctx, client, fmt.Sprintf("https://%v/wokanda", baseUrl))
	if err != nil {
		return false, AppTypeV1
	}

	// All of these fragments must be present for the page to count as V1.
	markers := [][]byte{
		[]byte(`<form action="index.php" method="GET" class="cases-form">`),
		[]byte(`<input name="p" type="hidden" value="cases"`),
		[]byte(`<input name="action" type="hidden" value="search"`),
	}
	for _, marker := range markers {
		if !bytes.Contains(body, marker) {
			return false, AppTypeV1
		}
	}

	return true, AppTypeV1
}
Loading

0 comments on commit 3cabace

Please sign in to comment.