Skip to content

Commit

Permalink
Merge branch 'datasift-master'
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolaasuni committed May 1, 2017
2 parents 55555fc + 25101ab commit 7f33872
Show file tree
Hide file tree
Showing 38 changed files with 16,051 additions and 504 deletions.
174 changes: 138 additions & 36 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,91 +1,180 @@
# MAKEFILE
#
# @author Nicola Asuni <[email protected]>
# @link https://github.com/datasift/GoOse
# @author Nicola Asuni <[email protected]>
# @link https://github.com/advancedlogic/GoOse
#
# This file is intended to be executed in a Linux-compatible system.
# It also assumes that the project has been cloned in the right path under GOPATH:
# $GOPATH/src/github.com/advancedlogic/GoOse
#
# ------------------------------------------------------------------------------

# List special make targets that are not associated with files
.PHONY: help all test format fmtcheck vet lint coverage qa deps nuke
.PHONY: help all test format fmtcheck vet lint coverage cyclo ineffassign misspell structcheck varcheck errcheck gosimple astscan qa deps clean nuke

# Ensure everyone is using bash. Note that Ubuntu now uses dash which doesn't support PIPESTATUS.
# Use bash as shell (Note: Ubuntu now uses dash which doesn't support PIPESTATUS).
SHELL=/bin/bash

# name of RPM or DEB package
PKGNAME=GoOse
# CVS path (path to the parent dir containing the project)
CVSPATH=github.com/advancedlogic

# Project owner
OWNER=advancedlogic

# Project vendor
VENDOR=advancedlogic

# Project name
PROJECT=GoOse

# Project version
VERSION=$(shell cat VERSION)

# Name of RPM or DEB package
PKGNAME=${VENDOR}-${PROJECT}

# Current directory
CURRENTDIR=$(shell pwd)

# Go lang path
GOPATH=$(shell readlink -f $(shell pwd)/../../../../)
# GO lang path
ifneq ($(GOPATH),)
ifeq ($(findstring $(GOPATH),$(CURRENTDIR)),)
# the defined GOPATH is not valid
GOPATH=
endif
endif
ifeq ($(GOPATH),)
# extract the GOPATH
GOPATH=$(firstword $(subst /src/, ,$(CURRENTDIR)))
endif

# --- MAKE TARGETS ---

# Display general help about this command
help:
@echo ""
@echo "Welcome to $(PKGNAME) make."
@echo "$(PROJECT) Makefile."
@echo "GOPATH=$(GOPATH)"
@echo "The following commands are available:"
@echo ""
@echo " make qa : Run all the tests"
@echo " make qa : Run all the tests"
@echo " make test : Run the unit tests"
@echo ""
@echo " make test : Run the unit tests"
@echo " make test.short : Run the unit tests with the short option"
@echo " make format : Format the source code"
@echo " make fmtcheck : Check if the source code has been formatted"
@echo " make vet : Check for suspicious constructs"
@echo " make lint : Check for style errors"
@echo " make coverage : Generate the coverage report"
@echo " make cyclo : Generate the cyclomatic complexity report"
@echo " make ineffassign : Detect ineffectual assignments"
@echo " make misspell : Detect commonly misspelled words in source files"
@echo " make structcheck : Find unused struct fields"
@echo " make varcheck : Find unused global variables and constants"
@echo " make errcheck : Check that error return values are used"
@echo " make gosimple : Suggest code simplifications"
@echo " make astscan : GO AST scanner"
@echo ""
@echo " make format : Format the source code"
@echo " make fmtcheck : Check if the source code has been formatted"
@echo " make vet : Check for syntax errors"
@echo " make lint : Check for style errors"
@echo " make coverage : Generate the coverage report"
@echo " make docs : Generate source code documentation"
@echo ""
@echo " make docs : Generate source code documentation"
@echo ""
@echo " make deps : Get the dependencies"
@echo " make nuke : Deletes any intermediate file"
@echo " make deps : Get the dependencies"
@echo " make clean : Remove any build artifact"
@echo " make nuke : Deletes any intermediate file"
@echo ""


# Alias for help target
all: help

# Run the unit tests
test:
@mkdir -p target/test
@mkdir -p target/report
GOPATH=$(GOPATH) go test -covermode=count -coverprofile=target/report/coverage.out -bench=. -race -v ./... | tee >(PATH=$(GOPATH)/bin:$(PATH) go-junit-report > target/test/report.xml); test $${PIPESTATUS[0]} -eq 0

# Run the unit tests with the short option
test.short:
@mkdir -p target/test
GOPATH=$(GOPATH) go test -short -race -v ./... | tee >(PATH=$(GOPATH)/bin:$(PATH) go-junit-report > target/test/report.xml); test $${PIPESTATUS[0]} -eq 0
GOPATH=$(GOPATH) \
go test \
-covermode=atomic \
-bench=. \
-race \
-cpuprofile=target/report/cpu.out \
-memprofile=target/report/mem.out \
-mutexprofile=target/report/mutex.out \
-coverprofile=target/report/coverage.out \
-v ./... | \
tee >(PATH=$(GOPATH)/bin:$(PATH) go-junit-report > target/test/report.xml); \
test $${PIPESTATUS[0]} -eq 0

# Format the source code
format:
@find ./ -type f -name "*.go" -exec gofmt -w {} \;
@find . -type f -name "*.go" -exec gofmt -s -w {} \;

# Check if the source code has been formatted
fmtcheck:
@mkdir -p target
@find ./ -type f -name "*.go" -exec gofmt -d {} \; | tee target/format.diff
@find . -type f -name "*.go" -exec gofmt -s -d {} \; | tee target/format.diff
@test ! -s target/format.diff || { echo "ERROR: the source code has not been formatted - please use 'make format' or 'gofmt'"; exit 1; }

# Check for suspicious constructs
vet:
GOPATH=$(GOPATH) go vet ./...
GOPATH=$(GOPATH) go vet .

# Check for style errors
lint:
GOPATH=$(GOPATH) PATH=$(GOPATH)/bin:$(PATH) golint ./...
GOPATH=$(GOPATH) PATH=$(GOPATH)/bin:$(PATH) golint .

# Generate the coverage report
coverage:
GOPATH=$(GOPATH) go tool cover -html=target/report/coverage.out -o target/report/coverage.html
@mkdir -p target/report
GOPATH=$(GOPATH) \
go tool cover -html=target/report/coverage.out -o target/report/coverage.html

# Report cyclomatic complexity
cyclo:
@mkdir -p target/report
GOPATH=$(GOPATH) gocyclo -avg ./ | tee target/report/cyclo.txt ; test $${PIPESTATUS[0]} -eq 0

# Detect ineffectual assignments
ineffassign:
@mkdir -p target/report
GOPATH=$(GOPATH) ineffassign ./ | tee target/report/ineffassign.txt ; test $${PIPESTATUS[0]} -eq 0

# Detect commonly misspelled words in source files
misspell:
@mkdir -p target/report
GOPATH=$(GOPATH) misspell -error ./*.go | tee target/report/misspell.txt

# Find unused struct fields
structcheck:
@mkdir -p target/report
GOPATH=$(GOPATH) structcheck -a ./ | tee target/report/structcheck.txt

# Find unused global variables and constants
varcheck:
@mkdir -p target/report
GOPATH=$(GOPATH) varcheck -e ./ | tee target/report/varcheck.txt

# Check that error return values are used
errcheck:
@mkdir -p target/report
GOPATH=$(GOPATH) errcheck ./ | tee target/report/errcheck.txt

# Suggest code simplifications
gosimple:
@mkdir -p target/report
GOPATH=$(GOPATH) gosimple ./ | tee target/report/gosimple.txt

# AST scanner
astscan:
@mkdir -p target/report
GOPATH=$(GOPATH) gas .//*.go | tee target/report/astscan.txt

# Generate source docs
docs:
@mkdir -p target/docs
nohup sh -c 'GOPATH=$(GOPATH) godoc -http=127.0.0.1:6060' > target/godoc_server.log 2>&1 &
wget --directory-prefix=target/docs/ --execute robots=off --retry-connrefused --recursive --no-parent --adjust-extension --page-requisites --convert-links http://127.0.0.1:6060/pkg/github.com/datasift/'${PKGNAME}'/ ; kill -9 `lsof -ti :6060`
echo '<html><head><meta http-equiv="refresh" content="0;./127.0.0.1:6060/pkg/github.com/datasift/'${PKGNAME}'/index.html"/></head><a href="./127.0.0.1:6060/pkg/github.com/datasift/'${PKGNAME}'/index.html">'${PKGNAME}' Documentation ...</a></html>' > target/docs/index.html
wget --directory-prefix=target/docs/ --execute robots=off --retry-connrefused --recursive --no-parent --adjust-extension --page-requisites --convert-links http://127.0.0.1:6060/pkg/github.com/${VENDOR}/${PROJECT}/ ; kill -9 `lsof -ti :6060`
@echo '<html><head><meta http-equiv="refresh" content="0;./127.0.0.1:6060/pkg/'${CVSPATH}'/'${PROJECT}'/index.html"/></head><a href="./127.0.0.1:6060/pkg/'${CVSPATH}'/'${PROJECT}'/index.html">'${PKGNAME}' Documentation ...</a></html>' > target/docs/index.html

# Alias to run targets: fmtcheck test vet lint coverage
qa: fmtcheck test vet lint coverage
# Alias to run all quality-assurance checks
qa: fmtcheck test vet lint coverage cyclo ineffassign misspell structcheck varcheck errcheck gosimple astscan

# --- INSTALL ---

Expand All @@ -95,7 +184,20 @@ deps:
GOPATH=$(GOPATH) go get github.com/golang/lint/golint
GOPATH=$(GOPATH) go get github.com/jstemmer/go-junit-report
GOPATH=$(GOPATH) go get github.com/axw/gocov/gocov
GOPATH=$(GOPATH) go get github.com/fzipp/gocyclo
GOPATH=$(GOPATH) go get github.com/gordonklaus/ineffassign
GOPATH=$(GOPATH) go get github.com/client9/misspell/cmd/misspell
GOPATH=$(GOPATH) go get github.com/opennota/check/cmd/structcheck
GOPATH=$(GOPATH) go get github.com/opennota/check/cmd/varcheck
GOPATH=$(GOPATH) go get github.com/kisielk/errcheck
GOPATH=$(GOPATH) go get honnef.co/go/tools/cmd/gosimple
GOPATH=$(GOPATH) go get github.com/GoASTScanner/gas

# Remove any build artifact
clean:
GOPATH=$(GOPATH) go clean ./...

# Deletes any intermediate file
nuke:
rm -rf ./target
GOPATH=$(GOPATH) go clean -i ./...
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
# GoOse

*HTML Content / Article Extractor in Golang*

[![Build Status](https://secure.travis-ci.org/advancedlogic/GoOse.png?branch=master)](https://travis-ci.org/advancedlogic/GoOse?branch=master)
[![Coverage Status](https://coveralls.io/repos/advancedlogic/GoOse/badge.svg?branch=master&service=github)](https://coveralls.io/github/advancedlogic/GoOse?branch=master)
[![Go Report Card](https://goreportcard.com/badge/github.com/advancedlogic/GoOse)](https://goreportcard.com/report/github.com/advancedlogic/GoOse)
[![GoDoc](https://godoc.org/github.com/advancedlogic/GoOse?status.svg)](http://godoc.org/github.com/advancedlogic/GoOse)


Html Content / Article Extractor in Golang
## Description

This is a golang port of "Goose" originally licensed to Gravity.com
under one or more contributor license agreements. See the NOTICE file
Expand Down Expand Up @@ -62,19 +67,14 @@ To see all available options:
make help
```

To build the project
```bash
make build
```

Before committing the code, please check if it passes all tests using
```bash
make deps
make qa
```

## TODO
- [ ] better organize code
- [ ] add comments and godoc
- [ ] improve "xpath" like queries
- [ ] add other image extractions techniques (imagemagick)

Expand Down
1 change: 1 addition & 0 deletions VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1.1.0
2 changes: 1 addition & 1 deletion article.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package goose

import (
"github.com/advancedlogic/goquery"
"github.com/PuerkitoBio/goquery"
"gopkg.in/fatih/set.v0"
)

Expand Down
92 changes: 92 additions & 0 deletions charset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package goose

import (
"fmt"
"strings"
"unicode/utf8"

"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)

// NormaliseCharset maps a charset label to the name used for decoding.
// The label is upper-cased first; common misspellings and equivalent
// encodings are then replaced by a single canonical name. Labels that have
// no override are returned upper-cased and otherwise unchanged.
// @see https://encoding.spec.whatwg.org#names-and-labels
func NormaliseCharset(characterSet string) string {
	switch label := strings.ToUpper(characterSet); label {
	// UTF-8 misspellings and variants
	case "UTF8",
		"UT-8",
		"UTR-8",
		"UFT-8",
		"UTF8-WITHOUT-BOM",
		"UTF8_GENERAL_CI":
		return "UTF-8"

	// Japanese
	// CP943: IBM OS/2 Japanese, superset of Cp932 and Shift-JIS
	case "CP943",
		"CP943C",
		"SIFT_JIS",
		"SHIFT-JIS":
		return "SHIFT_JIS"

	// Korean
	case "EUC-KR",
		"MS949",
		"KSC5601",
		"WINDOWS-949",
		"KS_C_5601-1987",
		"KSC_5601":
		return "UHC"

	// Thai (override currently disabled)
	//case "TIS-620", "WINDOWS-874":
	//	return "ISO-8859-11"

	// Latin-2
	case "LATIN2_HUNGARIAN_CI",
		"LATIN2":
		return "LATIN-2"

	// Cyrillic
	case "WIN1251",
		"WIN-1251",
		"WINDOWS-1251":
		return "CP1251"

	// Hebrew
	case "WINDOWS-1255":
		return "ISO-8859-8"

	// Turkish (override currently disabled)
	//case "WINDOWS-1254":
	//	return "ISO-8859-9"

	// Baltic
	case "WINDOWS-1257":
		return "ISO-8859-13"

	// Treat ISO-8859-1 and its look-alikes as Windows-1252 (CP1252): the
	// high bytes that ISO-8859-1 reserves for control characters are
	// printable symbols in CP1252.
	case "ANSI",
		"LATIN-1",
		"ISO",
		"RFC",
		"MACINTOSH",
		"8859-1",
		"8859-15",
		"ISO8859-1",
		"ISO8859-15",
		"ISO-8559-1",
		"ISO-8859-1",
		"ISO-8859-15":
		return "CP1252"

	default:
		// No override: hand back the upper-cased label.
		return label
	}
}

// UTF8encode converts raw from sourceCharset to UTF-8, skipping over input
// that the decoder reports as an error.
//
// sourceCharset is resolved with charset.Lookup. When no encoding is found, a
// diagnostic is printed to standard output and raw is returned unchanged.
//
// The result contains only the bytes the decoder actually wrote. The output
// buffer is over-allocated when it has to grow (and may be larger than the
// decoded text to begin with), so it is truncated to the written length before
// being converted to a string; otherwise the result would carry trailing NUL
// bytes.
// @see http://stackoverflow.com/questions/32512500/ignore-illegal-bytes-when-decoding-text-with-go
func UTF8encode(raw string, sourceCharset string) string {
	enc, name := charset.Lookup(sourceCharset)
	if nil == enc {
		fmt.Println("Cannot convert from", sourceCharset, ":", name)
		return raw
	}

	// Convert the input once rather than allocating a new slice on every pass.
	src := []byte(raw)
	dst := make([]byte, len(src))
	d := enc.NewDecoder()

	var (
		in  int // number of source bytes consumed so far
		out int // number of destination bytes written so far
	)
	for in < len(src) {
		// Do the transformation
		ndst, nsrc, err := d.Transform(dst[out:], src[in:], true)
		in += nsrc
		out += ndst
		if err == nil {
			// Completed transformation
			break
		}
		if err == transform.ErrShortDst {
			// Our output buffer is too small, so we need to grow it,
			// keeping only what has been written so far.
			t := make([]byte, (cap(dst)+1)*2)
			copy(t, dst[:out])
			dst = t
			continue
		}
		// We're here because of at least one illegal character. Skip over the
		// current rune and try again. The width is measured with the UTF-8
		// decoder, which yields 1 for a byte that is not valid UTF-8.
		_, width := utf8.DecodeRuneInString(raw[in:])
		in += width
	}
	// Return only the decoded bytes, not the unused tail of the buffer.
	return string(dst[:out])
}
Loading

0 comments on commit 7f33872

Please sign in to comment.