require jaybizzle/crawler-detect

This commit is contained in:
grandeljay 2022-11-19 16:31:49 +01:00
parent 99dd799165
commit ccf753f0cc
24 changed files with 3606 additions and 4 deletions

View file

@ -4,6 +4,7 @@
"grandel/include-directory": "^0.2.2", "grandel/include-directory": "^0.2.2",
"qferr/mjml-php": "^1.1", "qferr/mjml-php": "^1.1",
"gettext/gettext": "^5.6", "gettext/gettext": "^5.6",
"gettext/translator": "^1.1" "gettext/translator": "^1.1",
"jaybizzle/crawler-detect": "^1.2"
} }
} }

54
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "95b279b3d80afe5a3510611c3f13a8be", "content-hash": "5c756734244f4cf01d6362d20503f50d",
"packages": [ "packages": [
{ {
"name": "composer/ca-bundle", "name": "composer/ca-bundle",
@ -764,6 +764,58 @@
], ],
"time": "2022-08-28T14:45:39+00:00" "time": "2022-08-28T14:45:39+00:00"
}, },
{
"name": "jaybizzle/crawler-detect",
"version": "v1.2.112",
"source": {
"type": "git",
"url": "https://github.com/JayBizzle/Crawler-Detect.git",
"reference": "2c555ce35a07a5c1c808cee7d5bb52c41a4c7b2f"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/JayBizzle/Crawler-Detect/zipball/2c555ce35a07a5c1c808cee7d5bb52c41a4c7b2f",
"reference": "2c555ce35a07a5c1c808cee7d5bb52c41a4c7b2f",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8|^5.5|^6.5|^9.4"
},
"type": "library",
"autoload": {
"psr-4": {
"Jaybizzle\\CrawlerDetect\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Mark Beech",
"email": "m@rkbee.ch",
"role": "Developer"
}
],
"description": "CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent",
"homepage": "https://github.com/JayBizzle/Crawler-Detect/",
"keywords": [
"crawler",
"crawler detect",
"crawler detector",
"crawlerdetect",
"php crawler detect"
],
"support": {
"issues": "https://github.com/JayBizzle/Crawler-Detect/issues",
"source": "https://github.com/JayBizzle/Crawler-Detect/tree/v1.2.112"
},
"time": "2022-10-05T21:52:44+00:00"
},
{ {
"name": "ml/iri", "name": "ml/iri",
"version": "1.1.4", "version": "1.1.4",

View file

@ -12,6 +12,7 @@ return array(
'Psr\\Http\\Message\\' => array($vendorDir . '/psr/http-factory/src', $vendorDir . '/psr/http-message/src'), 'Psr\\Http\\Message\\' => array($vendorDir . '/psr/http-factory/src', $vendorDir . '/psr/http-message/src'),
'Psr\\Http\\Client\\' => array($vendorDir . '/psr/http-client/src'), 'Psr\\Http\\Client\\' => array($vendorDir . '/psr/http-client/src'),
'ML\\JsonLD\\' => array($vendorDir . '/ml/json-ld'), 'ML\\JsonLD\\' => array($vendorDir . '/ml/json-ld'),
'Jaybizzle\\CrawlerDetect\\' => array($vendorDir . '/jaybizzle/crawler-detect/src'),
'HtmlParser\\' => array($vendorDir . '/oscarotero/html-parser/src'), 'HtmlParser\\' => array($vendorDir . '/oscarotero/html-parser/src'),
'GuzzleHttp\\Psr7\\' => array($vendorDir . '/guzzlehttp/psr7/src'), 'GuzzleHttp\\Psr7\\' => array($vendorDir . '/guzzlehttp/psr7/src'),
'GuzzleHttp\\Promise\\' => array($vendorDir . '/guzzlehttp/promises/src'), 'GuzzleHttp\\Promise\\' => array($vendorDir . '/guzzlehttp/promises/src'),

View file

@ -34,6 +34,10 @@ class ComposerStaticInit5f3db9fc1d0cf1dd6a77a1d84501b4b1
array ( array (
'ML\\JsonLD\\' => 10, 'ML\\JsonLD\\' => 10,
), ),
'J' =>
array (
'Jaybizzle\\CrawlerDetect\\' => 24,
),
'H' => 'H' =>
array ( array (
'HtmlParser\\' => 11, 'HtmlParser\\' => 11,
@ -83,6 +87,10 @@ class ComposerStaticInit5f3db9fc1d0cf1dd6a77a1d84501b4b1
array ( array (
0 => __DIR__ . '/..' . '/ml/json-ld', 0 => __DIR__ . '/..' . '/ml/json-ld',
), ),
'Jaybizzle\\CrawlerDetect\\' =>
array (
0 => __DIR__ . '/..' . '/jaybizzle/crawler-detect/src',
),
'HtmlParser\\' => 'HtmlParser\\' =>
array ( array (
0 => __DIR__ . '/..' . '/oscarotero/html-parser/src', 0 => __DIR__ . '/..' . '/oscarotero/html-parser/src',

View file

@ -785,6 +785,61 @@
], ],
"install-path": "../guzzlehttp/psr7" "install-path": "../guzzlehttp/psr7"
}, },
{
"name": "jaybizzle/crawler-detect",
"version": "v1.2.112",
"version_normalized": "1.2.112.0",
"source": {
"type": "git",
"url": "https://github.com/JayBizzle/Crawler-Detect.git",
"reference": "2c555ce35a07a5c1c808cee7d5bb52c41a4c7b2f"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/JayBizzle/Crawler-Detect/zipball/2c555ce35a07a5c1c808cee7d5bb52c41a4c7b2f",
"reference": "2c555ce35a07a5c1c808cee7d5bb52c41a4c7b2f",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8|^5.5|^6.5|^9.4"
},
"time": "2022-10-05T21:52:44+00:00",
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-4": {
"Jaybizzle\\CrawlerDetect\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Mark Beech",
"email": "m@rkbee.ch",
"role": "Developer"
}
],
"description": "CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent",
"homepage": "https://github.com/JayBizzle/Crawler-Detect/",
"keywords": [
"crawler",
"crawler detect",
"crawler detector",
"crawlerdetect",
"php crawler detect"
],
"support": {
"issues": "https://github.com/JayBizzle/Crawler-Detect/issues",
"source": "https://github.com/JayBizzle/Crawler-Detect/tree/v1.2.112"
},
"install-path": "../jaybizzle/crawler-detect"
},
{ {
"name": "ml/iri", "name": "ml/iri",
"version": "1.1.4", "version": "1.1.4",

View file

@ -3,7 +3,7 @@
'name' => '__root__', 'name' => '__root__',
'pretty_version' => 'dev-develop', 'pretty_version' => 'dev-develop',
'version' => 'dev-develop', 'version' => 'dev-develop',
'reference' => 'dbac609620f0c637189d7d4806892a77ccef4a87', 'reference' => '99dd799165573cb1f7b4c02b3385a1e690b05855',
'type' => 'library', 'type' => 'library',
'install_path' => __DIR__ . '/../../', 'install_path' => __DIR__ . '/../../',
'aliases' => array(), 'aliases' => array(),
@ -13,7 +13,7 @@
'__root__' => array( '__root__' => array(
'pretty_version' => 'dev-develop', 'pretty_version' => 'dev-develop',
'version' => 'dev-develop', 'version' => 'dev-develop',
'reference' => 'dbac609620f0c637189d7d4806892a77ccef4a87', 'reference' => '99dd799165573cb1f7b4c02b3385a1e690b05855',
'type' => 'library', 'type' => 'library',
'install_path' => __DIR__ . '/../../', 'install_path' => __DIR__ . '/../../',
'aliases' => array(), 'aliases' => array(),
@ -100,6 +100,15 @@
'aliases' => array(), 'aliases' => array(),
'dev_requirement' => false, 'dev_requirement' => false,
), ),
'jaybizzle/crawler-detect' => array(
'pretty_version' => 'v1.2.112',
'version' => '1.2.112.0',
'reference' => '2c555ce35a07a5c1c808cee7d5bb52c41a4c7b2f',
'type' => 'library',
'install_path' => __DIR__ . '/../jaybizzle/crawler-detect',
'aliases' => array(),
'dev_requirement' => false,
),
'ml/iri' => array( 'ml/iri' => array(
'pretty_version' => '1.1.4', 'pretty_version' => '1.1.4',
'version' => '1.1.4.0', 'version' => '1.1.4.0',

View file

@ -0,0 +1,23 @@
# Runs PHP CS Fixer on every push and commits any resulting style fixes back
# to the pushed branch.
name: Check & fix styling
on: [ push ]
jobs:
  php-cs-fixer:
    runs-on: ubuntu-latest
    # The final step pushes a commit, so the job token needs write access to
    # the repository contents.
    permissions:
      contents: write
    steps:
      - name: Checkout code
        # v2 targets a retired Node runtime; v4 accepts the same inputs.
        uses: actions/checkout@v4
        with:
          ref: ${{ github.head_ref }}
      - name: Run PHP CS Fixer
        # Fixer image is pinned so the rule names in .php_cs.dist stay valid.
        uses: docker://oskarstark/php-cs-fixer-ga:2.18.6
        with:
          args: --config=.php_cs.dist --allow-risky=yes
      - name: Commit changes
        uses: stefanzweifel/git-auto-commit-action@v5
        with:
          commit_message: Fix styling

View file

@ -0,0 +1,54 @@
# CI: runs the PHPUnit suite on every PHP version in the matrix and uploads the
# clover coverage report to Coveralls.
name: Test
on:
  push:
    branches:
      - "master"
  pull_request:
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: true
      matrix:
        # Quoted on purpose: a bare 7.0 or 8.0 is a YAML number and can lose
        # its trailing zero before it reaches setup-php.
        php: ['5.3', '5.4', '5.5', '5.6', '7.0', '7.1', '7.2', '7.3', '7.4', '8.0', '8.1']
    name: PHP:${{ matrix.php }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup PHP, with composer
        uses: shivammathur/setup-php@v2
        with:
          php-version: ${{ matrix.php }}
          tools: composer:v2
          coverage: xdebug
      - name: Get composer cache directory
        id: composer-cache
        # Step outputs are written to $GITHUB_OUTPUT; the old `::set-output`
        # workflow command is deprecated.
        run: echo "dir=$(composer config cache-files-dir)" >> "$GITHUB_OUTPUT"
      - name: Cache composer dependencies
        uses: actions/cache@v4
        with:
          path: ${{ steps.composer-cache.outputs.dir }}
          key: dependencies-php-${{ matrix.php }}-composer-${{ hashFiles('composer.json') }}
          restore-keys: dependencies-php-${{ matrix.php }}-composer-
      - name: Install Composer dependencies
        # `--no-suggest` was dropped: it is a deprecated no-op in Composer 2.
        run: |
          composer install --prefer-dist --no-interaction
      - name: Run Unit tests
        run: |
          vendor/bin/phpunit --coverage-clover=tests/logs/clover.xml
      - name: Upload coverage results to Coveralls
        env:
          COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          composer global require php-coveralls/php-coveralls "^1.0"
          coveralls --coverage_clover=tests/logs/clover.xml -v

View file

@ -0,0 +1,33 @@
<?php
// Directories whose PHP sources are checked by the fixer.
$directories = [
    __DIR__.'/src',
    __DIR__.'/tests',
];

// Every *.php file below those directories, skipping dot files and VCS metadata.
$finder = Symfony\Component\Finder\Finder::create()
    ->in($directories)
    ->name('*.php')
    ->ignoreDotFiles(true)
    ->ignoreVCS(true);

// PSR-2 as the baseline, plus the additional fixers listed here.
$rules = [
    '@PSR2' => true,
    'array_syntax' => ['syntax' => 'long'],
    'ordered_imports' => ['sortAlgorithm' => 'alpha'],
    'no_unused_imports' => true,
    'not_operator_with_successor_space' => true,
    'trailing_comma_in_multiline_array' => true,
    'phpdoc_scalar' => true,
    'unary_operator_spaces' => true,
    'binary_operator_spaces' => true,
    'blank_line_before_statement' => [
        'statements' => ['break', 'continue', 'declare', 'return', 'throw', 'try'],
    ],
    'phpdoc_single_line_var_spacing' => true,
    'phpdoc_var_without_name' => true,
    'method_argument_space' => [
        'on_multiline' => 'ensure_fully_multiline',
        'keep_multiple_spaces_after_comma' => true,
    ],
];

// The fixer loads this file and uses the returned configuration object.
return PhpCsFixer\Config::create()
    ->setRules($rules)
    ->setFinder($finder);

22
vendor/jaybizzle/crawler-detect/LICENSE vendored Normal file
View file

@ -0,0 +1,22 @@
The MIT License (MIT)
Copyright (c) 2015-2020 Mark Beech
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,76 @@
<p align="center"><a href="https://crawlerdetect.io/" target="_blank"><img src="https://cloud.githubusercontent.com/assets/340752/23082173/1bd1a396-f550-11e6-8aba-4d3c75edea2f.png" width="321" height="219" /></a><br><br>
<a href="https://crawlerdetect.io/" target="_blank">crawlerdetect.io</a>
<br><br>
</p>
<p align="center">
<a href="https://github.com/JayBizzle/Crawler-Detect/actions"><img alt="GitHub Workflow Status" src="https://img.shields.io/github/workflow/status/JayBizzle/Crawler-Detect/Test?style=flat-square"></a>
<a href="https://packagist.org/packages/jaybizzle/crawler-detect"><img src="https://img.shields.io/packagist/dm/JayBizzle/Crawler-Detect.svg?style=flat-square" /></a>
<a href="https://github.com/JayBizzle/Crawler-Detect"><img src="https://img.shields.io/badge/license-MIT-ff69b4.svg?style=flat-square" /></a>
<a href="https://packagist.org/packages/jaybizzle/crawler-detect"><img src="https://img.shields.io/packagist/v/jaybizzle/Crawler-Detect.svg?style=flat-square" /></a>
<a href="https://coveralls.io/github/JayBizzle/Crawler-Detect"><img src="https://img.shields.io/coveralls/JayBizzle/Crawler-Detect/master.svg?style=flat-square" /></a>
</p>
## About CrawlerDetect
CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the `user agent` and `http_from` header. It can currently detect thousands of bots/spiders/crawlers.
### Installation
```
composer require jaybizzle/crawler-detect
```
### Usage
```PHP
use Jaybizzle\CrawlerDetect\CrawlerDetect;
$CrawlerDetect = new CrawlerDetect;
// Check the user agent of the current 'visitor'
if($CrawlerDetect->isCrawler()) {
// true if crawler user agent detected
}
// Pass a user agent as a string
if($CrawlerDetect->isCrawler('Mozilla/5.0 (compatible; Sosospider/2.0; +http://help.soso.com/webspider.htm)')) {
// true if crawler user agent detected
}
// Output the name of the bot that matched (if any)
echo $CrawlerDetect->getMatches();
```
### Contributing
If you find a bot/spider/crawler user agent that CrawlerDetect fails to detect, please submit a pull request with the regex pattern added to the `$data` array in `src/Fixtures/Crawlers.php` and add the failing user agent to `tests/crawlers.txt`.
Failing that, just create an issue with the user agent you have found, and we'll take it from there :)
### Laravel Package
If you would like to use this with Laravel, please see [Laravel-Crawler-Detect](https://github.com/JayBizzle/Laravel-Crawler-Detect)
### Symfony Bundle
To use this library with Symfony 2/3/4, check out the [CrawlerDetectBundle](https://github.com/nicolasmure/CrawlerDetectBundle).
### YII2 Extension
To use this library with the YII2 framework, check out [yii2-crawler-detect](https://github.com/AlikDex/yii2-crawler-detect).
### ES6 Library
To use this library with Node.js or any ES6-based application, check out [es6-crawler-detect](https://github.com/JefferyHus/es6-crawler-detect).
### Python Library
To use this library in a Python project, check out [crawlerdetect](https://github.com/moskrc/CrawlerDetect).
### JVM Library (written in Java)
To use this library in a JVM project (including Java, Scala, Kotlin, etc.), check out [CrawlerDetect](https://github.com/nekosoftllc/crawler-detect).
### .NET Library
To use this library in a .net standard (including .net core) based project, check out [NetCrawlerDetect](https://github.com/gplumb/NetCrawlerDetect).
### Ruby Gem
To use this library with Ruby on Rails or any Ruby-based application, check out [crawler_detect](https://github.com/loadkpi/crawler_detect) gem.
### Go Module
To use this library with Go, check out the [crawlerdetect](https://github.com/x-way/crawlerdetect) module.
_Parts of this class are based on the brilliant [MobileDetect](https://github.com/serbanghita/Mobile-Detect)_
[![Analytics](https://ga-beacon.appspot.com/UA-72430465-1/Crawler-Detect/readme?pixel)](https://github.com/JayBizzle/Crawler-Detect)

View file

@ -0,0 +1,29 @@
{
"name": "jaybizzle/crawler-detect",
"type": "library",
"description": "CrawlerDetect is a PHP class for detecting bots/crawlers/spiders via the user agent",
"keywords": ["crawler", "crawler detect", "crawler detector", "crawlerdetect", "php crawler detect"],
"homepage": "https://github.com/JayBizzle/Crawler-Detect/",
"license": "MIT",
"authors": [
{
"name": "Mark Beech",
"email": "m@rkbee.ch",
"role": "Developer"
}
],
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8|^5.5|^6.5|^9.4"
},
"autoload": {
"psr-4": {
"Jaybizzle\\CrawlerDetect\\": "src/"
}
},
"scripts": {
"test": "vendor/bin/phpunit"
}
}

View file

@ -0,0 +1,41 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
require 'src/Fixtures/AbstractProvider.php';
require 'src/Fixtures/Crawlers.php';
require 'src/Fixtures/Exclusions.php';
require 'src/Fixtures/Headers.php';
// Short class names of the fixture providers to export.
$fixtureNames = array('Crawlers', 'Exclusions', 'Headers');

foreach ($fixtureNames as $shortName) {
    // Resolve the short name to its fully-qualified fixture class.
    $fixtureClass = "Jaybizzle\\CrawlerDetect\\Fixtures\\{$shortName}";
    $provider = new $fixtureClass();

    // Write the provider's data set in both raw formats.
    outputJson($provider);
    outputTxt($provider);
}
/**
 * Write a provider's data set as JSON to raw/<ShortClassName>.json.
 *
 * The path is relative to the current working directory.
 *
 * @param object $object Provider exposing getAll().
 */
function outputJson($object)
{
    $reflection = new ReflectionClass($object);
    $target = 'raw/'.$reflection->getShortName().'.json';
    $payload = json_encode($object->getAll());

    file_put_contents($target, $payload);
}
/**
 * Write a provider's data set to raw/<ShortClassName>.txt, one entry per line.
 *
 * The path is relative to the current working directory.
 *
 * @param object $object Provider exposing getAll().
 */
function outputTxt($object)
{
    $reflection = new ReflectionClass($object);
    $target = 'raw/'.$reflection->getShortName().'.txt';
    $lines = implode(PHP_EOL, $object->getAll());

    file_put_contents($target, $lines);
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1 @@
["Safari.[\\d\\.]*","Firefox.[\\d\\.]*"," Chrome.[\\d\\.]*","Chromium.[\\d\\.]*","MSIE.[\\d\\.]","Opera\\\/[\\d\\.]*","Mozilla.[\\d\\.]*","AppleWebKit.[\\d\\.]*","Trident.[\\d\\.]*","Windows NT.[\\d\\.]*","Android [\\d\\.]*","Macintosh.","Ubuntu","Linux","[ ]Intel","Mac OS X [\\d_]*","(like )?Gecko(.[\\d\\.]*)?","KHTML,","CriOS.[\\d\\.]*","CPU iPhone OS ([0-9_])* like Mac OS X","CPU OS ([0-9_])* like Mac OS X","iPod","compatible","x86_..","i686","x64","X11","rv:[\\d\\.]*","Version.[\\d\\.]*","WOW64","Win64","Dalvik.[\\d\\.]*"," \\.NET CLR [\\d\\.]*","Presto.[\\d\\.]*","Media Center PC","BlackBerry","Build","Opera Mini\\\/\\d{1,2}\\.\\d{1,2}\\.[\\d\\.]*\\\/\\d{1,2}\\.","Opera"," \\.NET[\\d\\.]*","cubot","; M bot","; CRONO","; B bot","; IDbot","; ID bot","; POWER BOT","OCTOPUS-CORE"]

View file

@ -0,0 +1,48 @@
Safari.[\d\.]*
Firefox.[\d\.]*
Chrome.[\d\.]*
Chromium.[\d\.]*
MSIE.[\d\.]
Opera\/[\d\.]*
Mozilla.[\d\.]*
AppleWebKit.[\d\.]*
Trident.[\d\.]*
Windows NT.[\d\.]*
Android [\d\.]*
Macintosh.
Ubuntu
Linux
[ ]Intel
Mac OS X [\d_]*
(like )?Gecko(.[\d\.]*)?
KHTML,
CriOS.[\d\.]*
CPU iPhone OS ([0-9_])* like Mac OS X
CPU OS ([0-9_])* like Mac OS X
iPod
compatible
x86_..
i686
x64
X11
rv:[\d\.]*
Version.[\d\.]*
WOW64
Win64
Dalvik.[\d\.]*
\.NET CLR [\d\.]*
Presto.[\d\.]*
Media Center PC
BlackBerry
Build
Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.
Opera
\.NET[\d\.]*
cubot
; M bot
; CRONO
; B bot
; IDbot
; ID bot
; POWER BOT
OCTOPUS-CORE

View file

@ -0,0 +1 @@
["HTTP_USER_AGENT","HTTP_X_OPERAMINI_PHONE_UA","HTTP_X_DEVICE_USER_AGENT","HTTP_X_ORIGINAL_USER_AGENT","HTTP_X_SKYFIRE_PHONE","HTTP_X_BOLT_PHONE_UA","HTTP_DEVICE_STOCK_UA","HTTP_X_UCBROWSER_DEVICE_UA","HTTP_FROM","HTTP_X_SCANNER"]

View file

@ -0,0 +1,10 @@
HTTP_USER_AGENT
HTTP_X_OPERAMINI_PHONE_UA
HTTP_X_DEVICE_USER_AGENT
HTTP_X_ORIGINAL_USER_AGENT
HTTP_X_SKYFIRE_PHONE
HTTP_X_BOLT_PHONE_UA
HTTP_DEVICE_STOCK_UA
HTTP_X_UCBROWSER_DEVICE_UA
HTTP_FROM
HTTP_X_SCANNER

View file

@ -0,0 +1,187 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect;
use Jaybizzle\CrawlerDetect\Fixtures\Crawlers;
use Jaybizzle\CrawlerDetect\Fixtures\Exclusions;
use Jaybizzle\CrawlerDetect\Fixtures\Headers;
/**
 * Detects bots/crawlers/spiders by matching a user agent string against the
 * crawler patterns, after stripping the exclusion patterns from it.
 */
class CrawlerDetect
{
    /**
     * The user agent.
     *
     * @var string|null
     */
    protected $userAgent;

    /**
     * Headers that contain a user agent.
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Matches captured by the most recent isCrawler() call.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * Crawlers object.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Crawlers
     */
    protected $crawlers;

    /**
     * Exclusions object.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Exclusions
     */
    protected $exclusions;

    /**
     * Headers object.
     *
     * @var \Jaybizzle\CrawlerDetect\Fixtures\Headers
     */
    protected $uaHttpHeaders;

    /**
     * The compiled regex string.
     *
     * @var string
     */
    protected $compiledRegex;

    /**
     * The compiled exclusions regex string.
     *
     * @var string
     */
    protected $compiledExclusions;

    /**
     * Class constructor.
     *
     * @param array|null  $headers   Request headers; when null or empty, $_SERVER is used.
     * @param string|null $userAgent User agent; when null, it is built from the headers.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        $this->crawlers = new Crawlers();
        $this->exclusions = new Exclusions();
        $this->uaHttpHeaders = new Headers();

        $this->compiledRegex = $this->compileRegex($this->crawlers->getAll());
        $this->compiledExclusions = $this->compileRegex($this->exclusions->getAll());

        // Headers must be set first: setUserAgent() reads them when no
        // explicit user agent is given.
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Compile the regex patterns into one alternation group.
     *
     * The patterns are joined verbatim with "|". isCrawler() wraps the result
     * in "/" delimiters, so any "/" inside a pattern must already be escaped.
     *
     * @param array $patterns
     *
     * @return string
     */
    public function compileRegex($patterns)
    {
        return '('.implode('|', $patterns).')';
    }

    /**
     * Set HTTP headers.
     *
     * @param array|null $httpHeaders
     */
    public function setHttpHeaders($httpHeaders)
    {
        // Use global _SERVER if $httpHeaders aren't defined.
        if (! is_array($httpHeaders) || ! count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // Clear existing headers.
        $this->httpHeaders = array();

        // Only save HTTP headers. In PHP land, that means
        // only _SERVER vars that start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (strpos($key, 'HTTP_') === 0) {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return user agent headers.
     *
     * @return array
     */
    public function getUaHttpHeaders()
    {
        return $this->uaHttpHeaders->getAll();
    }

    /**
     * Set the user agent.
     *
     * When null is given, the user agent is assembled by concatenating the
     * values of every known user-agent header present in the stored headers,
     * each followed by a space. It stays null if none of them is present.
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent that was stored.
     */
    public function setUserAgent($userAgent)
    {
        if (is_null($userAgent)) {
            foreach ($this->getUaHttpHeaders() as $altHeader) {
                if (isset($this->httpHeaders[$altHeader])) {
                    $userAgent .= $this->httpHeaders[$altHeader].' ';
                }
            }
        }

        return $this->userAgent = $userAgent;
    }

    /**
     * Check user agent string against the regex.
     *
     * @param string|null $userAgent User agent to test; the stored one is used when empty.
     *
     * @return bool
     */
    public function isCrawler($userAgent = null)
    {
        // Forget the previous result up front: the early return below skips
        // preg_match(), which would otherwise leave the matches of an earlier
        // call visible through getMatches().
        $this->matches = array();

        // Strip the exclusion patterns so the crawler regex only has to scan
        // what remains. The cast keeps trim() from receiving null should
        // preg_replace() fail.
        $agent = trim((string) preg_replace(
            "/{$this->compiledExclusions}/i",
            '',
            $userAgent ?: $this->userAgent ?: ''
        ));

        // Nothing left after removing the exclusions: not a crawler.
        if ($agent === '') {
            return false;
        }

        return (bool) preg_match("/{$this->compiledRegex}/i", $agent, $this->matches);
    }

    /**
     * Return the text matched by the most recent isCrawler() call.
     *
     * @return string|null The matched text, or null when nothing matched.
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}

View file

@ -0,0 +1,32 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
/**
 * Base class for the fixture providers: holds a data set and exposes it
 * through getAll(). Subclasses supply the data by redeclaring $data.
 */
abstract class AbstractProvider
{
    /**
     * The data set.
     *
     * Defaults to an empty array so getAll() honours its array return type
     * even when a subclass does not redeclare the property.
     *
     * @var array
     */
    protected $data = array();

    /**
     * Return the data set.
     *
     * @return array
     */
    public function getAll()
    {
        return $this->data;
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,72 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
/**
* Regex fragments that CrawlerDetect removes from a user agent before it runs
* the crawler regex.
*
* CrawlerDetect joins the fragments with "|" and applies them between "/"
* delimiters with the "i" flag, so matching is case-insensitive and every
* literal "/" in a fragment has to stay escaped.
*/
class Exclusions extends AbstractProvider
{
/**
* List of strings to remove from the user agent before running the crawler regex
* Over a large list of user agents, this gives us about a 55% speed increase!
*
* Leading spaces inside a fragment are part of the pattern.
*
* @var array
*/
protected $data = array(
// Browser, engine and platform tokens, most followed by a version.
'Safari.[\d\.]*',
'Firefox.[\d\.]*',
' Chrome.[\d\.]*',
'Chromium.[\d\.]*',
// NOTE(review): unlike its neighbours this has no trailing "*", so only one
// version character after "MSIE." is removed - confirm this is intended.
'MSIE.[\d\.]',
'Opera\/[\d\.]*',
'Mozilla.[\d\.]*',
'AppleWebKit.[\d\.]*',
'Trident.[\d\.]*',
'Windows NT.[\d\.]*',
'Android [\d\.]*',
'Macintosh.',
'Ubuntu',
'Linux',
'[ ]Intel',
'Mac OS X [\d_]*',
'(like )?Gecko(.[\d\.]*)?',
'KHTML,',
'CriOS.[\d\.]*',
'CPU iPhone OS ([0-9_])* like Mac OS X',
'CPU OS ([0-9_])* like Mac OS X',
'iPod',
'compatible',
'x86_..',
'i686',
'x64',
'X11',
'rv:[\d\.]*',
'Version.[\d\.]*',
'WOW64',
'Win64',
'Dalvik.[\d\.]*',
' \.NET CLR [\d\.]*',
'Presto.[\d\.]*',
'Media Center PC',
'BlackBerry',
'Build',
'Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.',
'Opera',
' \.NET[\d\.]*',
// NOTE(review): most of the following contain "bot"; presumably they are
// stripped so that such names are not mistaken for crawlers - confirm.
'cubot',
'; M bot',
'; CRONO',
'; B bot',
'; IDbot',
'; ID bot',
'; POWER BOT',
'OCTOPUS-CORE',
);
}

View file

@ -0,0 +1,37 @@
<?php
/*
* This file is part of Crawler Detect - the web crawler detection library.
*
* (c) Mark Beech <m@rkbee.ch>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Jaybizzle\CrawlerDetect\Fixtures;
/**
* The $_SERVER-style header keys that CrawlerDetect treats as carrying a user
* agent. When no explicit user agent is supplied, CrawlerDetect::setUserAgent()
* concatenates the values of whichever of these keys are present.
*/
class Headers extends AbstractProvider
{
/**
* All possible HTTP headers that represent the user agent string.
*
* Keys use the "HTTP_" prefixed form, which is the only form that
* CrawlerDetect::setHttpHeaders() keeps.
*
* @var array
*/
protected $data = array(
// The default User-Agent string.
'HTTP_USER_AGENT',
// Header can occur on devices using Opera Mini.
'HTTP_X_OPERAMINI_PHONE_UA',
// Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
'HTTP_X_DEVICE_USER_AGENT',
// NOTE(review): the origin of the next five headers is not documented here;
// judging by their names they are vendor/proxy specific - confirm.
'HTTP_X_ORIGINAL_USER_AGENT',
'HTTP_X_SKYFIRE_PHONE',
'HTTP_X_BOLT_PHONE_UA',
'HTTP_DEVICE_STOCK_UA',
'HTTP_X_UCBROWSER_DEVICE_UA',
// Sometimes, bots (especially Google) use a genuine user agent, but fill this header in with their email address
'HTTP_FROM',
'HTTP_X_SCANNER', // Seen in use by Netsparker
);
}