File: //home/retile.ru/public_html/system/library/d_robots_txt_parser/robotstxtvalidator.php
<?php
namespace d_robots_txt_parser;

use Exception;

/**
 * Class RobotsTxtValidator
 *
 * Checks whether a URL is allowed or disallowed for crawling by a specific
 * user-agent according to robots.txt rules.
 */
class RobotsTxtValidator
{
    /**
     * @var array Per-user-agent cache of directives ordered by rule length,
     *            used to decide whether a URL is allowed or disallowed
     */
    private $orderedDirectivesCache;

    /**
     * @var array All rules from RobotsTxtParser
     */
    private $rules;

    /**
     * RobotsTxtValidator constructor
     *
     * @param array $rules Array of all rules from class RobotsTxtParser
     */
    public function __construct(array $rules)
    {
        $this->rules = $rules;
    }
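
    /*
     * Example usage (a minimal sketch): it assumes the rules array has the
     * shape produced by RobotsTxtParser (user-agent => directive => list of
     * rule values); the paths below are hypothetical.
     *
     *   $rules = array(
     *       '*' => array(
     *           'disallow' => array('/admin'),
     *           'allow'    => array('/admin/public'),
     *       ),
     *   );
     *   $validator = new RobotsTxtValidator($rules);
     *   $validator->isUrlAllow('/admin/public/page.html'); // true
     *   $validator->isUrlAllow('/admin/secret');           // false
     */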

    /**
     * Return true if the URL is allowed to be crawled according to the robots.txt rules, otherwise false
     *
     * @param string $url
     * @param string $userAgent
     * @return bool
     */
    public function isUrlAllow($url, $userAgent = '*')
    {
        $relativeUrl = $this->getRelativeUrl($url);
        $orderedDirectives = $this->getOrderedDirectivesByUserAgent($userAgent);

        // If there are no allow rules, the URL can be declared disallowed on the
        // first matching disallow rule, which avoids scanning the remaining rules.
        $hasAllowDirectives = false;
        foreach ($orderedDirectives as $directiveRow) {
            if ($directiveRow['directive'] == 'allow') {
                $hasAllowDirectives = true;
                break;
            }
        }

        // Directives are ordered by rule length (ascending), so the last matching
        // rule is the longest, most specific one and its verdict wins.
        $isAllow = true;
        foreach ($orderedDirectives as $directiveRow) {
            if (!in_array($directiveRow['directive'], array('allow', 'disallow'))) {
                continue;
            }
            if (preg_match($directiveRow['rule_regexp'], $relativeUrl)) {
                if ($directiveRow['directive'] == 'allow') {
                    $isAllow = true;
                } else {
                    if (!$hasAllowDirectives) {
                        return false;
                    }
                    $isAllow = false;
                }
            }
        }

        return $isAllow;
    }
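
    /*
     * Illustrative walkthrough (hypothetical rules): with "Disallow: /shop"
     * and "Allow: /shop/sale", the ordered directives are [/shop (disallow),
     * /shop/sale (allow)]. For the URL /shop/sale/item both regexps match,
     * and the allow rule is applied last because it is longer, so
     * isUrlAllow('/shop/sale/item') returns true.
     */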

    /**
     * Return true if the URL is disallowed from crawling according to the robots.txt rules, otherwise false
     *
     * @param string $url
     * @param string $userAgent
     * @return bool
     */
    public function isUrlDisallow($url, $userAgent = '*')
    {
        return !$this->isUrlAllow($url, $userAgent);
    }

    /**
     * Get the allow and disallow directives for a specific user-agent, ordered by rule length (ascending)
     * If you have already stored robots.txt rules in a database, you can use a query like this to fetch ordered rules:
     * mysql> SELECT directive, value FROM robots_txt WHERE site_id = ?d AND directive IN ('allow', 'disallow') AND user_agent = ? ORDER BY CHAR_LENGTH(value) ASC;
     *
     * @param string $userAgent
     * @return array
     */
    private function getOrderedDirectivesByUserAgent($userAgent)
    {
        if (!isset($this->orderedDirectivesCache[$userAgent])) {
            if (!empty($this->rules[$userAgent])) {
                // store the ordered directives in the per-instance cache
                $this->orderedDirectivesCache[$userAgent] = $this->orderDirectives($this->rules[$userAgent]);
            } else {
                $this->orderedDirectivesCache[$userAgent] = array();
            }
        }

        return $this->orderedDirectivesCache[$userAgent];
    }

    /**
     * Order allow/disallow directives by rule character length (ascending)
     *
     * @param array $rules
     * @return array
     */
    private function orderDirectives(array $rules)
    {
        $directives = array();

        $allowRules = !empty($rules['allow']) ? $rules['allow'] : array();
        $disallowRules = !empty($rules['disallow']) ? $rules['disallow'] : array();

        foreach ($allowRules as $rule) {
            $directives[] = array(
                'directive' => 'allow',
                'rule' => $rule,
                'rule_regexp' => $this->prepareRegexpRule($rule),
            );
        }

        foreach ($disallowRules as $rule) {
            $directives[] = array(
                'directive' => 'disallow',
                'rule' => $rule,
                'rule_regexp' => $this->prepareRegexpRule($rule),
            );
        }

        // The comparator must return an int, not a bool: a boolean comparator
        // yields an incorrect ordering and is deprecated since PHP 8.0.
        usort($directives, function ($row1, $row2) {
            return mb_strlen($row1['rule']) - mb_strlen($row2['rule']);
        });

        return $directives;
    }
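
    /*
     * For example (hypothetical input):
     *   orderDirectives(array('allow' => array('/a/b'), 'disallow' => array('/a')))
     * returns the disallow row for '/a' before the allow row for '/a/b', so the
     * matching loop in isUrlAllow() applies the longer, more specific rule last.
     */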

    /**
     * Always returns the relative URL (without scheme and domain), starting with "/", e.g.:
     *
     * http://example.com/test -> /test
     * https://example.com/test/path -> /test/path
     * /test/any/path -> /test/any/path
     * http://example.com -> /
     * / -> /
     * /some/path -> /some/path
     *
     * @param string $url
     * @return string
     * @throws Exception
     */
    private function getRelativeUrl($url)
    {
        if (!$url) {
            throw new Exception('Url should not be empty');
        }

        if (!preg_match('!^https?://!i', $url)) {
            if ($url[0] !== '/') {
                throw new Exception('Url should start from "/" or has protocol with domain, got ' . $url);
            }

            return $url;
        }

        // parse_url() returns null when the URL has no path component
        // (e.g. "http://example.com"), which maps to the root path "/".
        return parse_url($url, PHP_URL_PATH) ?: '/';
    }

    /**
     * Convert a robots.txt rule to a PHP regexp
     *
     * @param string $ruleValue
     * @return string
     */
    private static function prepareRegexpRule($ruleValue)
    {
        $replacements = array(
            // In robots.txt, a trailing "$" anchors the rule to the end of the URL:
            // keep it as the regexp end anchor, and escape any other "$" literally.
            '/\$(?!$)/' => '\$',
            '/\?/' => '\?',
            '/\./' => '\.',
            '/\*/' => '.*',
        );

        $ruleValue = preg_replace(array_keys($replacements), array_values($replacements), $ruleValue);
        $regexp = '/^' . str_replace('/', '\/', $ruleValue) . '/';

        // Collapse double-escaped slashes (a rule already containing "\/" would
        // otherwise produce "\\/" after the slash escaping above).
        return str_replace('\\\\/', '\/', $regexp);
    }
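
    /*
     * A few illustrative conversions (hypothetical rules):
     *   /admin      -> /^\/admin/
     *   /*.php      -> /^\/.*\.php/
     *   /search?q=  -> /^\/search\?q=/
     *   /private/$  -> /^\/private\/$/  (matches exactly "/private/")
     */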
}