Skip to content

Commit

Permalink
v5 (spatie#318)
Browse files Browse the repository at this point in the history
* wip

* wip

* cleanup

* wip

* move classes to correct namespaces

* wip

* wip

* wip

* Fix styling

* wip

* wip

* wip

* Fix styling

Co-authored-by: freekmurze <[email protected]>
  • Loading branch information
freekmurze and freekmurze authored Sep 29, 2020
1 parent 1f954a5 commit 279b283
Show file tree
Hide file tree
Showing 39 changed files with 714 additions and 473 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/php-cs-fixer.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Runs PHP-CS-Fixer on every push and commits any resulting changes back
# to the branch that was pushed.
name: Code style

on: [push]

jobs:
  style:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Fix style
        uses: docker://oskarstark/php-cs-fixer-ga
        with:
          args: --config=.php_cs --allow-risky=yes

      # Strip the "refs/heads/" prefix from GITHUB_REF and expose the result
      # as the `branch` output of this step. The output is written to
      # $GITHUB_OUTPUT because the `set-output` workflow command is deprecated.
      - name: Extract branch name
        shell: bash
        run: echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT"
        id: extract_branch

      # NOTE(review): the action reference was garbled by e-mail obfuscation
      # in the page this was copied from; `git-auto-commit-action@v2.3.0` is
      # the reconstructed value -- confirm against the repository.
      - name: Commit changes
        uses: stefanzweifel/git-auto-commit-action@v2.3.0
        with:
          commit_message: Fix styling
          branch: ${{ steps.extract_branch.outputs.branch }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
8 changes: 4 additions & 4 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: run-tests
name: Tests

on: [push, pull_request]

Expand All @@ -9,14 +9,14 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
php: [7.4, 7.3, 7.2, 7.1]
php: [7.4]
dependency-version: [prefer-lowest, prefer-stable]

name: P${{ matrix.php }} - ${{ matrix.dependency-version }} - ${{ matrix.os }}

steps:
- name: Checkout code
uses: actions/checkout@v1
uses: actions/checkout@v2

- name: Install Puppeteer
run: npm install puppeteer
Expand All @@ -31,7 +31,7 @@ jobs:
run: sleep 5

- name: Cache dependencies
uses: actions/cache@v1
uses: actions/cache@v2
with:
path: ~/.composer/cache/files
key: dependencies-laravel-${{ matrix.laravel }}-php-${{ matrix.php }}-composer-${{ hashFiles('composer.json') }}
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
build
composer.lock
vendor
.phpunit.result.cache
42 changes: 42 additions & 0 deletions .php_cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?php

// PHP-CS-Fixer configuration: builds the file finder and the rule set, then
// returns the resulting config object to the tool that includes this file.

// Directories, relative to this config file, that are searched for files.
$sourceDirectories = [
    __DIR__ . '/src',
    __DIR__ . '/tests',
];

// Only *.php files are selected; Blade templates, dot files, VCS metadata and
// the bootstrap/storage/vendor paths are left out.
$finder = Symfony\Component\Finder\Finder::create()
    ->notPath('bootstrap/*')
    ->notPath('storage/*')
    ->notPath('vendor')
    ->in($sourceDirectories)
    ->name('*.php')
    ->notName('*.blade.php')
    ->ignoreDotFiles(true)
    ->ignoreVCS(true);

// PSR-2 as the baseline, plus the project-specific fixers listed below.
$rules = [
    '@PSR2' => true,
    'array_syntax' => ['syntax' => 'short'],
    'ordered_imports' => ['sortAlgorithm' => 'alpha'],
    'no_unused_imports' => true,
    'not_operator_with_successor_space' => true,
    'trailing_comma_in_multiline_array' => true,
    'phpdoc_scalar' => true,
    'unary_operator_spaces' => true,
    'binary_operator_spaces' => true,
    'blank_line_before_statement' => [
        'statements' => ['break', 'continue', 'declare', 'return', 'throw', 'try'],
    ],
    'phpdoc_single_line_var_spacing' => true,
    'phpdoc_var_without_name' => true,
    'class_attributes_separation' => [
        'elements' => [
            'method', 'property',
        ],
    ],
    'method_argument_space' => [
        'on_multiline' => 'ensure_fully_multiline',
        'keep_multiple_spaces_after_comma' => true,
    ],
];

return PhpCsFixer\Config::create()
    ->setRules($rules)
    ->setFinder($finder);
1 change: 1 addition & 0 deletions .php_cs.cache
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"php":"7.4.10","version":"2.16.4","indent":" ","lineEnding":"\n","rules":{"blank_line_after_namespace":true,"braces":true,"class_definition":true,"constant_case":true,"elseif":true,"function_declaration":true,"indentation_type":true,"line_ending":true,"lowercase_keywords":true,"method_argument_space":{"on_multiline":"ensure_fully_multiline","keep_multiple_spaces_after_comma":true},"no_break_comment":true,"no_closing_tag":true,"no_spaces_after_function_name":true,"no_spaces_inside_parenthesis":true,"no_trailing_whitespace":true,"no_trailing_whitespace_in_comment":true,"single_blank_line_at_eof":true,"single_class_element_per_statement":{"elements":["property"]},"single_import_per_statement":true,"single_line_after_imports":true,"switch_case_semicolon_to_colon":true,"switch_case_space":true,"visibility_required":true,"encoding":true,"full_opening_tag":true,"array_syntax":{"syntax":"short"},"ordered_imports":{"sortAlgorithm":"alpha"},"no_unused_imports":true,"not_operator_with_successor_space":true,"trailing_comma_in_multiline_array":true,"phpdoc_scalar":true,"unary_operator_spaces":true,"binary_operator_spaces":true,"blank_line_before_statement":{"statements":["break","continue","declare","return","throw","try"]},"phpdoc_single_line_var_spacing":true,"phpdoc_var_without_name":true,"class_attributes_separation":{"elements":["method","property"]}},"hashes":{"src\/Exceptions\/InvalidUrl.php":3578513628,"src\/Exceptions\/UrlNotFoundByIndex.php":3684066567,"src\/Exceptions\/InvalidCrawlRequestHandler.php":3329568979,"src\/CrawlerRobots.php":1283784978,"src\/CrawlProfiles\/CrawlProfile.php":3952254936,"src\/CrawlProfiles\/CrawlAllUrls.php":1687893686,"src\/CrawlProfiles\/CrawlSubdomains.php":386442931,"src\/CrawlProfiles\/CrawlInternalUrls.php":531411513,"src\/LinkAdder.php":4183233950,"src\/ResponseWithCachedBody.php":2242088627,"src\/CrawlObservers\/CrawlObserverCollection.php":1938268009,"src\/CrawlObservers\/CrawlObserver.php":3313776943,"src\/CrawlUrl.php":140071972,"
src\/Crawler.php":463124883,"src\/Handlers\/CrawlRequestFulfilled.php":453577302,"src\/Handlers\/CrawlRequestFailed.php":3838322502,"src\/CrawlQueues\/ArrayCrawlQueue.php":1668659547,"src\/CrawlQueues\/CrawlQueue.php":1118419410,"tests\/CrawlerRobotsTest.php":1132888709,"tests\/CrawlObserverCollectionTest.php":1172257760,"tests\/ArrayCrawlQueueTest.php":2293223579,"tests\/TestClasses\/CrawlLogger.php":2089171280,"tests\/TestCase.php":950285644,"tests\/CrawlerTest.php":3083274183}}
18 changes: 0 additions & 18 deletions .scrutinizer.yml

This file was deleted.

4 changes: 0 additions & 4 deletions .styleci.yml

This file was deleted.

9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

All notable changes to `spatie/crawler` will be documented in this file.

## 5.0.0 - 2020-09-29

- improve chunked reading of response
- move observer / profiles / queues to separate namespaces
- typehint all the things
- use laravel/collections instead of tightenco package
- remove support for anything below PHP 7.4
- remove all deprecated functions and classes

## 4.7.5 - 2020-09-12

- treat connection exceptions as request exceptions
Expand Down
20 changes: 9 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,14 @@

[![Latest Version on Packagist](https://img.shields.io/packagist/v/spatie/crawler.svg?style=flat-square)](https://packagist.org/packages/spatie/crawler)
[![MIT Licensed](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat-square)](LICENSE.md)
![run-tests](https://github.com/spatie/crawler/workflows/run-tests/badge.svg)
[![StyleCI](https://styleci.io/repos/45406338/shield)](https://styleci.io/repos/45406338)
![Tests](https://github.com/spatie/crawler/workflows/Tests/badge.svg)
![Check & fix styling](https://github.com/spatie/crawler/workflows/Code%20style/badge.svg)
[![Total Downloads](https://img.shields.io/packagist/dt/spatie/crawler.svg?style=flat-square)](https://packagist.org/packages/spatie/crawler)

This package provides a class to crawl links on a website. Under the hood Guzzle promises are used to [crawl multiple urls concurrently](http://docs.guzzlephp.org/en/latest/quickstart.html?highlight=pool#concurrent-requests).

Because the crawler can execute JavaScript, it can crawl JavaScript rendered sites. Under the hood [Chrome and Puppeteer](https://github.com/spatie/browsershot) are used to power this feature.

Spatie is a webdesign agency in Antwerp, Belgium. You'll find an overview of all our open source projects [on our website](https://spatie.be/opensource).

## Support us

Learn how to create a package like this one, by watching our premium video course:
Expand All @@ -38,7 +36,7 @@ The crawler can be instantiated like this
use Spatie\Crawler\Crawler;

Crawler::create()
->setCrawlObserver(<class that extends \Spatie\Crawler\CrawlObserver>)
->setCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
->startCrawling($url);
```

Expand Down Expand Up @@ -105,8 +103,8 @@ You can set multiple observers with `setCrawlObservers`:
```php
Crawler::create()
->setCrawlObservers([
<class that extends \Spatie\Crawler\CrawlObserver>,
<class that extends \Spatie\Crawler\CrawlObserver>,
<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>,
<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>,
...
])
->startCrawling($url);
Expand All @@ -116,9 +114,9 @@ Alternatively you can set multiple observers one by one with `addCrawlObserver`:

```php
Crawler::create()
->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObserver>)
->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObserver>)
->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObserver>)
->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
->addCrawlObserver(<class that extends \Spatie\Crawler\CrawlObservers\CrawlObserver>)
->startCrawling($url);
```

Expand Down Expand Up @@ -315,7 +313,7 @@ node server.js

With the server running, you can start testing.
```bash
vendor/bin/phpunit
composer tests
```

## Security
Expand Down
11 changes: 5 additions & 6 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,17 @@
}
],
"require": {
"php": "^7.1",
"guzzlehttp/guzzle": "^6.3 || ^7.0",
"php": "^7.4",
"guzzlehttp/guzzle": "^6.3|^7.0",
"guzzlehttp/psr7": "^1.4",
"illuminate/collections": "^8.6",
"nicmart/tree": "^0.3.0",
"spatie/browsershot": "^3.14",
"spatie/robots-txt": "^1.0.1",
"symfony/dom-crawler": "^4.0 || ^5.0",
"tightenco/collect": "^5.6 || ^6.0 || ^7.0"
"symfony/dom-crawler": "^5.0"
},
"require-dev": {
"larapack/dd": "^1.1",
"phpunit/phpunit": "^7.0"
"phpunit/phpunit": "^9.3"
},
"config": {
"sort-packages": true
Expand Down
31 changes: 11 additions & 20 deletions phpunit.xml.dist
Original file line number Diff line number Diff line change
@@ -1,22 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit bootstrap="vendor/autoload.php"
backupGlobals="false"
backupStaticAttributes="false"
colors="true"
verbose="true"
convertErrorsToExceptions="true"
convertNoticesToExceptions="true"
convertWarningsToExceptions="true"
processIsolation="false"
stopOnFailure="false">
<testsuites>
<testsuite name="League Test Suite">
<directory>tests</directory>
</testsuite>
</testsuites>
<filter>
<whitelist>
<directory suffix=".php">src/</directory>
</whitelist>
</filter>
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" bootstrap="vendor/autoload.php" backupGlobals="false" backupStaticAttributes="false" colors="true" verbose="true" convertErrorsToExceptions="true" convertNoticesToExceptions="true" convertWarningsToExceptions="true" processIsolation="false" stopOnFailure="false" xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/9.3/phpunit.xsd">
<coverage>
<include>
<directory suffix=".php">src/</directory>
</include>
</coverage>
<testsuites>
<testsuite name="League Test Suite">
<directory>tests</directory>
</testsuite>
</testsuites>
</phpunit>
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php

namespace Spatie\Crawler;
namespace Spatie\Crawler\CrawlObservers;

use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
<?php

namespace Spatie\Crawler;
namespace Spatie\Crawler\CrawlObservers;

use ArrayAccess;
use GuzzleHttp\Exception\RequestException;
use Iterator;
use Psr\Http\Message\ResponseInterface;
use Spatie\Crawler\CrawlUrl;

class CrawlObserverCollection implements ArrayAccess, Iterator
{
/** @var \Spatie\Crawler\CrawlObserver[] */
protected $observers;
/** @var \Spatie\Crawler\CrawlObservers\CrawlObserver[] */
protected array $observers;

/** @var int */
protected $position;
protected int $position;

public function __construct(array $observers = [])
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php

namespace Spatie\Crawler;
namespace Spatie\Crawler\CrawlProfiles;

use Psr\Http\Message\UriInterface;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php

namespace Spatie\Crawler;
namespace Spatie\Crawler\CrawlProfiles;

use GuzzleHttp\Psr7\Uri;
use Psr\Http\Message\UriInterface;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php

namespace Spatie\Crawler;
namespace Spatie\Crawler\CrawlProfiles;

use Psr\Http\Message\UriInterface;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php

namespace Spatie\Crawler;
namespace Spatie\Crawler\CrawlProfiles;

use GuzzleHttp\Psr7\Uri;
use Psr\Http\Message\UriInterface;
Expand Down
Loading

0 comments on commit 279b283

Please sign in to comment.