*
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace ML\JsonLD;

use ML\JsonLD\Exception\JsonLdException;
use ML\IRI\IRI;

/**
 * The FileGetContentsLoader loads remote documents by calling file_get_contents
 *
 * @author Markus Lanthaler
 */
class FileGetContentsLoader implements DocumentLoaderInterface
{
    /**
     * {@inheritdoc}
     *
     * If the trimmed input starts with "{" or "[" it is treated as an inline
     * JSON document and parsed directly. Otherwise it is handed to
     * file_get_contents(); the reported media type, redirects and HTTP Link
     * headers of the response are evaluated to fill the returned
     * RemoteDocument.
     *
     * @param string $url A URL/path to load, or a string containing JSON.
     *
     * @return RemoteDocument The loaded (and parsed) document.
     *
     * @throws JsonLdException If the document cannot be loaded, if more than
     *                         one context or alternate Link header is found,
     *                         or if the media type is not a JSON media type.
     */
    public function loadDocument($url)
    {
        $input = trim($url);

        // The parentheses around the two comparisons matter: without them
        // "&&" binds tighter than "||" and $input[0] would be read on an
        // empty string without the isset() guard.
        $looksLikeJson = isset($input[0]) && (('{' === $input[0]) || ('[' === $input[0]));

        if ($looksLikeJson) {
            return new RemoteDocument($url, Processor::parse($input));
        }

        // The input looks like a file/URL, try to retrieve it
        $remoteDocument = new RemoteDocument($url);

        $streamContextOptions = array(
            'method'  => 'GET',
            'header'  => "Accept: application/ld+json, application/json; q=0.9, */*; q=0.1\r\n" .
                         "User-Agent: lanthaler JsonLD\r\n",
            'timeout' => Processor::REMOTE_TIMEOUT
        );

        $context = stream_context_create(array(
            'http'  => $streamContextOptions,
            'https' => $streamContextOptions
        ));

        // Index into $http_response_header at which the headers of the final
        // response begin; moved forward by the callback on every redirect.
        $httpHeadersOffset = 0;

        stream_context_set_params($context, array('notification' =>
            function ($code, $severity, $msg, $msgCode, $bytesTx, $bytesMax) use (
                &$remoteDocument,
                &$http_response_header,
                &$httpHeadersOffset
            ) {
                if ($code === STREAM_NOTIFY_MIME_TYPE_IS) {
                    $remoteDocument->mediaType = $msg;
                } elseif ($code === STREAM_NOTIFY_REDIRECTED) {
                    // A redirect invalidates everything learned so far
                    $remoteDocument->documentUrl = $msg;
                    $remoteDocument->mediaType = null;

                    $httpHeadersOffset = isset($http_response_header)
                        ? count($http_response_header)
                        : 0;
                }
            }
        ));

        if (false === ($input = @file_get_contents($url, false, $context))) {
            throw new JsonLdException(
                JsonLdException::LOADING_DOCUMENT_FAILED,
                sprintf('Unable to load the remote document "%s".', $url),
                $http_response_header
            );
        }

        // Extract HTTP Link headers. Only entries after $httpHeadersOffset
        // are inspected; the entry at the offset itself is skipped.
        $linkHeaderValues = array();
        if (is_array($http_response_header)) {
            for ($i = count($http_response_header) - 1; $i > $httpHeadersOffset; $i--) {
                if (0 === substr_compare($http_response_header[$i], 'Link:', 0, 5, true)) {
                    $linkHeaderValues[] = substr($http_response_header[$i], 5);
                }
            }
        }

        $linkHeaderValues = $this->parseLinkHeaders($linkHeaderValues, new IRI($url));

        // array_filter() preserves the original keys, so the result has to be
        // reindexed before its first element can be accessed as [0].
        $contextLinkHeaders = array_values(array_filter($linkHeaderValues, function ($link) {
            return (isset($link['rel'])
                && in_array('http://www.w3.org/ns/json-ld#context', explode(' ', $link['rel']), true));
        }));

        if (count($contextLinkHeaders) === 1) {
            $remoteDocument->contextUrl = $contextLinkHeaders[0]['uri'];
        } elseif (count($contextLinkHeaders) > 1) {
            throw new JsonLdException(
                JsonLdException::MULTIPLE_CONTEXT_LINK_HEADERS,
                'Found multiple contexts in HTTP Link headers',
                $http_response_header
            );
        }

        // If we got a media type, we verify it
        if ($remoteDocument->mediaType) {
            // Drop any media type parameters such as profiles
            if (false !== ($pos = strpos($remoteDocument->mediaType, ';'))) {
                $remoteDocument->mediaType = substr($remoteDocument->mediaType, 0, $pos);
            }

            $remoteDocument->mediaType = trim($remoteDocument->mediaType);

            if ('application/ld+json' === $remoteDocument->mediaType) {
                // A context Link header is ignored for application/ld+json
                $remoteDocument->contextUrl = null;
            } else {
                // If the Media type was not as expected, check to see if the desired content type
                // is being offered in a Link header (this is what schema.org now does).
                // Reindexed for the same reason as $contextLinkHeaders above.
                $altLinkHeaders = array_values(array_filter($linkHeaderValues, function ($link) {
                    return (isset($link['rel']) && isset($link['type'])
                        && ($link['rel'] === 'alternate') && ($link['type'] === 'application/ld+json'));
                }));

                // The spec states 'A response MUST NOT contain more than one HTTP Link Header
                // using the alternate link relation with type="application/ld+json"'
                if (count($altLinkHeaders) === 1) {
                    // NOTE(review): there is no guard against alternate links
                    // that point back at each other - confirm whether one is needed.
                    return $this->loadDocument($altLinkHeaders[0]['uri']);
                } elseif (count($altLinkHeaders) > 1) {
                    throw new JsonLdException(
                        JsonLdException::LOADING_DOCUMENT_FAILED,
                        'Received multiple alternate link headers'
                    );
                }

                // Anything that is neither application/json nor "*+json" is rejected
                if (('application/json' !== $remoteDocument->mediaType) &&
                    (0 !== substr_compare($remoteDocument->mediaType, '+json', -5))) {
                    throw new JsonLdException(
                        JsonLdException::LOADING_DOCUMENT_FAILED,
                        'Invalid media type',
                        $remoteDocument->mediaType
                    );
                }
            }
        }

        $remoteDocument->document = Processor::parse($input);

        return $remoteDocument;
    }

    /**
     * Parse HTTP Link headers
     *
     * Header values containing several comma-separated links are split into
     * individual links first (commas inside double-quoted strings are kept).
     * Each link is then split at semicolons into its target and parameters.
     *
     * @param array $values  An array of HTTP Link headers.
     * @param IRI   $baseIri The document's URL (used to expand relative URLs to absolutes).
     *
     * @return array A structured representation of the Link header values:
     *               a list of associative arrays, each holding the resolved
     *               link target under the key "uri" and one entry per
     *               parameter (parameter name => value with surrounding
     *               quotes and whitespace trimmed).
     *
     * @internal Do not use this method directly, it's only temporarily accessible for testing.
     */
    public function parseLinkHeaders(array $values, IRI $baseIri)
    {
        // The loop below addresses the entries by position, so make sure the
        // keys really are 0..n-1 regardless of what the caller passed in.
        $values = array_values($values);

        // Separate multiple links contained in a single header value
        for ($i = 0, $total = count($values); $i < $total; $i++) {
            if (strpos($values[$i], ',') !== false) {
                // Split at commas that are followed by an even number of
                // double quotes, i.e., commas outside of quoted strings
                foreach (preg_split('/,(?=([^"]*"[^"]*")*[^"]*$)/', $values[$i]) as $v) {
                    $values[] = trim($v);
                }
                unset($values[$i]);
            }
        }

        $matches = array();
        $trimWhitespaceCallback = function ($str) {
            return trim($str, "\"' \n\t");
        };

        // Split the header in key-value pairs
        $result = array();

        foreach ($values as $val) {
            $part = array();

            foreach (preg_split('/;(?=([^"]*"[^"]*")*[^"]*$)/', $val) as $kvp) {
                preg_match_all('/<[^>]+>|[^=]+/', $kvp, $matches);
                $pieces = array_map($trimWhitespaceCallback, $matches[0]);

                if (count($pieces) > 1) {
                    // name=value parameter
                    $part[$pieces[0]] = $pieces[1];
                } elseif (count($pieces) === 1) {
                    // A piece without "=" is taken to be the link target
                    $part['uri'] = (string) $baseIri->resolve(trim($pieces[0], '<> '));
                }
            }

            if (!empty($part)) {
                $result[] = $part;
            }
        }

        return $result;
    }
}