wishthis/vendor/ml/json-ld/FileGetContentsLoader.php
2022-01-21 09:23:52 +01:00

197 lines
7.7 KiB
PHP

<?php
/*
* (c) Markus Lanthaler <mail@markus-lanthaler.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace ML\JsonLD;
use ML\JsonLD\Exception\JsonLdException;
use ML\IRI\IRI;
/**
* The FileGetContentsLoader loads remote documents by calling file_get_contents
*
* @author Markus Lanthaler <mail@markus-lanthaler.com>
*/
class FileGetContentsLoader implements DocumentLoaderInterface
{
/**
* {@inheritdoc}
*/
public function loadDocument($url)
{
// if input looks like a file, try to retrieve it
$input = trim($url);
if (false === (isset($input[0]) && ("{" === $input[0]) || ("[" === $input[0]))) {
$remoteDocument = new RemoteDocument($url);
$streamContextOptions = array(
'method' => 'GET',
'header' => "Accept: application/ld+json, application/json; q=0.9, */*; q=0.1\r\n"
. "User-Agent: lanthaler JsonLD\r\n",
'timeout' => Processor::REMOTE_TIMEOUT
);
$context = stream_context_create(array(
'http' => $streamContextOptions,
'https' => $streamContextOptions
));
$httpHeadersOffset = 0;
stream_context_set_params($context, array('notification' =>
function ($code, $severity, $msg, $msgCode, $bytesTx, $bytesMax) use (
&$remoteDocument, &$http_response_header, &$httpHeadersOffset
) {
if ($code === STREAM_NOTIFY_MIME_TYPE_IS) {
$remoteDocument->mediaType = $msg;
} elseif ($code === STREAM_NOTIFY_REDIRECTED) {
$remoteDocument->documentUrl = $msg;
$remoteDocument->mediaType = null;
$httpHeadersOffset = isset($http_response_header) ? count($http_response_header) : 0;
}
}
));
if (false === ($input = @file_get_contents($url, false, $context))) {
throw new JsonLdException(
JsonLdException::LOADING_DOCUMENT_FAILED,
sprintf('Unable to load the remote document "%s".', $url),
$http_response_header
);
}
// Extract HTTP Link headers
$linkHeaderValues = array();
if (is_array($http_response_header)) {
for ($i = count($http_response_header) - 1; $i > $httpHeadersOffset; $i--) {
if (0 === substr_compare($http_response_header[$i], 'Link:', 0, 5, true)) {
$value = substr($http_response_header[$i], 5);
$linkHeaderValues[] = $value;
}
}
}
$linkHeaderValues = $this->parseLinkHeaders($linkHeaderValues, new IRI($url));
$contextLinkHeaders = array_filter($linkHeaderValues, function ($link) {
return (isset($link['rel'])
&& in_array('http://www.w3.org/ns/json-ld#context', explode(' ', $link['rel'])));
});
if (count($contextLinkHeaders) === 1) {
$remoteDocument->contextUrl = $contextLinkHeaders[0]['uri'];
} elseif (count($contextLinkHeaders) > 1) {
throw new JsonLdException(
JsonLdException::MULTIPLE_CONTEXT_LINK_HEADERS,
'Found multiple contexts in HTTP Link headers',
$http_response_header
);
}
// If we got a media type, we verify it
if ($remoteDocument->mediaType) {
// Drop any media type parameters such as profiles
if (false !== ($pos = strpos($remoteDocument->mediaType, ';'))) {
$remoteDocument->mediaType = substr($remoteDocument->mediaType, 0, $pos);
}
$remoteDocument->mediaType = trim($remoteDocument->mediaType);
if ('application/ld+json' === $remoteDocument->mediaType) {
$remoteDocument->contextUrl = null;
} else {
// If the Media type was not as expected, check to see if the desired content type
// is being offered in a Link header (this is what schema.org now does).
$altLinkHeaders = array_filter($linkHeaderValues, function ($link) {
return (isset($link['rel']) && isset($link['type'])
&& ($link['rel'] === 'alternate') && ($link['type'] === 'application/ld+json'));
});
// The spec states 'A response MUST NOT contain more than one HTTP Link Header
// using the alternate link relation with type="application/ld+json"'
if (count($altLinkHeaders) === 1) {
return $this->loadDocument($altLinkHeaders[0]['uri']);
} elseif (count($altLinkHeaders) > 1) {
throw new JsonLdException(
JsonLdException::LOADING_DOCUMENT_FAILED,
'Received multiple alternate link headers'
);
}
if (('application/json' !== $remoteDocument->mediaType) &&
(0 !== substr_compare($remoteDocument->mediaType, '+json', -5))) {
throw new JsonLdException(
JsonLdException::LOADING_DOCUMENT_FAILED,
'Invalid media type',
$remoteDocument->mediaType
);
}
}
}
$remoteDocument->document = Processor::parse($input);
return $remoteDocument;
}
return new RemoteDocument($url, Processor::parse($input));
}
/**
* Parse HTTP Link headers
*
* @param array $values An array of HTTP Link headers.
* @param IRI $baseIri The document's URL (used to expand relative URLs to absolutes).
*
* @return array A structured representation of the Link header values.
*
* @internal Do not use this method directly, it's only temporarily accessible for testing.
*/
public function parseLinkHeaders(array $values, IRI $baseIri)
{
// Separate multiple links contained in a single header value
for ($i = 0, $total = count($values); $i < $total; $i++) {
if (strpos($values[$i], ',') !== false) {
foreach (preg_split('/,(?=([^"]*"[^"]*")*[^"]*$)/', $values[$i]) as $v) {
$values[] = trim($v);
}
unset($values[$i]);
}
}
$contexts = $matches = array();
$trimWhitespaceCallback = function ($str) {
return trim($str, "\"' \n\t");
};
// Split the header in key-value pairs
$result = array();
foreach ($values as $val) {
$part = array();
foreach (preg_split('/;(?=([^"]*"[^"]*")*[^"]*$)/', $val) as $kvp) {
preg_match_all('/<[^>]+>|[^=]+/', $kvp, $matches);
$pieces = array_map($trimWhitespaceCallback, $matches[0]);
if (count($pieces) > 1) {
$part[$pieces[0]] = $pieces[1];
} elseif (count($pieces) === 1) {
$part['uri'] = (string) $baseIri->resolve(trim($pieces[0], '<> '));
}
}
if (!empty($part)) {
$result[] = $part;
}
}
return $result;
}
}