zenfolio-scraper/main.php

135 lines
3.8 KiB
PHP
Raw Normal View History

2018-12-22 21:12:27 +01:00
<?php
/**
 * Build the LoadPhotoSetPhotos API URL for one page of a photo set.
 *
 * @param int $photoSetId Id of the photo set, sent as "photosetId".
 * @param int $offset     Index of the first photo, sent as "startingIndex".
 * @param int $pageSize   Number of photos requested, sent as "numberOfPhotos".
 *
 * @return string The full request URL including its query string.
 */
function getPaginatedUrl(int $photoSetId, int $offset, int $pageSize = 80) : string
{
    $urlFormat = 'https://api.zenfolio.com/api/private/1.7/zfapi.asmx/LoadPhotoSetPhotos'
        .'?startingIndex=%d&numberOfPhotos=%d&photosetId=%d';

    // Placeholder order in the format: offset, page size, photo set id.
    $placeholders = [$offset, $pageSize, $photoSetId];

    return vsprintf($urlFormat, $placeholders);
}
/**
 * Lazily fetch every photo of a photo set, one API page at a time.
 *
 * Pages of 1000 photos are requested until a page comes back empty or
 * shorter than the page size. Each <Photo> element of the XML response is
 * converted to a stdClass through a JSON round trip before being yielded.
 *
 * @param int $photoSetId Id of the photo set to load.
 *
 * @return \Generator Yields one stdClass per photo.
 *
 * @throws \RuntimeException When curl cannot be initialised, a page cannot
 *                           be downloaded, or a response is not valid XML.
 */
function getPhotosByPhotoSetId(int $photoSetId) : \Generator
{
    $page = 0;
    $pageSize = 1000;

    // Set up curl for fetching
    $curl = curl_init();
    if ($curl === false) {
        throw new \RuntimeException('Unable to initialise curl');
    }

    curl_setopt($curl, CURLOPT_USERAGENT, 'curl/7.62.0');
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);

    // try/finally guarantees the handle is released even when the consumer
    // stops iterating early or a page fails to load.
    try {
        while (true) {
            $url = getPaginatedUrl($photoSetId, $page * $pageSize, $pageSize);
            curl_setopt($curl, CURLOPT_URL, $url);

            // Fetch url
            $result = curl_exec($curl);
            if ($result === false) {
                throw new \RuntimeException(
                    sprintf('Failed to fetch %s: %s', $url, curl_error($curl))
                );
            }

            // Parse XML
            $sxe = simplexml_load_string($result);
            if ($sxe === false) {
                throw new \RuntimeException(
                    sprintf('Response from %s is not valid XML', $url)
                );
            }

            // If no photos, stop paging
            $photoCount = count($sxe->Photo);
            if ($photoCount === 0) {
                break;
            }

            foreach ($sxe->Photo as $photo) {
                // SimpleXML to stdClass
                yield json_decode(json_encode($photo));
            }

            // A short page is the last one; skip the request for an empty page
            if ($photoCount < $pageSize) {
                break;
            }

            $page++;
        }
    } finally {
        curl_close($curl);
    }
}
2018-12-23 11:33:05 +01:00
/**
 * Load a gallery page and derive its date and group name from the <title>.
 *
 * The site prefix "Sportgardens fotogalleri |" is stripped from the title.
 * A date written as YYYYMMDD or YYYY-MM-DD (dashes optional) is extracted
 * and normalised to YYYY-MM-DD; what remains of the title, after removing
 * digit runs, " -" separators and repeated whitespace, becomes the group.
 * When the title holds no date, $takenOn is used as a fallback if it can be
 * parsed by strtotime().
 *
 * @param string      $url     Gallery page URL (anything DOMDocument::loadHTMLFile accepts).
 * @param string|null $takenOn Optional date/time string used when the title has no date.
 *
 * @return stdClass Object with the properties "url", "date" (YYYY-MM-DD, or ''
 *                  when no date could be determined) and "group".
 */
function getPhotoGalleryMetadataByUrl(string $url, ?string $takenOn = null) : stdClass
{
    // Set up a dom document and fetch the url
    $doc = new DOMDocument();
    $doc->loadHTMLFile($url);

    // The page may lack a <title> (or fail to load); treat that as an empty title
    $titleNode = $doc->getElementsByTagName('title')->item(0);
    $pageTitle = $titleNode === null ? '' : $titleNode->textContent;
    $pageTitle = trim(str_replace('Sportgardens fotogalleri |', '', $pageTitle));

    $galleryDate = '';
    $galleryTitle = $pageTitle;

    // Match out date from title
    if (preg_match('#(\d\d\d\d-?\d\d-?\d\d)#', $pageTitle, $matches) === 1) {
        // Clean the date out of the title
        $galleryTitle = str_replace($matches[0], '', $pageTitle);

        // The pattern guarantees exactly eight digits once dashes are removed,
        // so every accepted spelling can be normalised the same way.
        $digits = str_replace('-', '', $matches[0]);
        $galleryDate = sprintf(
            '%s-%s-%s',
            substr($digits, 0, 4),
            substr($digits, 4, 2),
            substr($digits, 6, 2)
        );
    }

    // Drop remaining digit runs, " -" separators and collapse whitespace
    $galleryTitle = preg_replace('#\d.+\d#', '', $galleryTitle);
    $galleryTitle = preg_replace('#\s-#', '', $galleryTitle);
    $galleryTitle = preg_replace('#\s+#', ' ', $galleryTitle);

    // Fall back to the photo's own date when the title did not contain one
    if ($galleryDate === '' && $takenOn !== null && $takenOn !== '') {
        $timestamp = strtotime($takenOn);
        if ($timestamp !== false) {
            $galleryDate = date('Y-m-d', $timestamp);
        }
    }

    // Put metadata together
    return (object) [
        'url' => $url,
        'date' => $galleryDate,
        'group' => trim($galleryTitle),
    ];
}
/**
 * Yield the metadata of every distinct gallery found in a photo set.
 *
 * The gallery URL is each photo's PageUrl with its last "/segment" removed.
 * Metadata is fetched only the first time a gallery URL is seen.
 *
 * @param int $photoSetId Id of the photo set to scan.
 *
 * @return \Generator Yields one metadata object (as returned by
 *                    getPhotoGalleryMetadataByUrl) per distinct gallery.
 */
function getPhotoSetUrlsByPhotoSetId(int $photoSetId) : \Generator
{
    $seenGalleryUrls = [];

    foreach (getPhotosByPhotoSetId($photoSetId) as $photo) {
        // Photos come from a SimpleXML -> JSON round trip, so a field may be
        // absent or (presumably, for an empty XML element) an object instead
        // of a string. Skip photos without a usable page URL.
        $pageUrl = $photo->PageUrl ?? null;
        if (!is_string($pageUrl) || $pageUrl === '') {
            continue;
        }

        // Strip the trailing photo segment to get the gallery URL
        $galleryUrl = preg_replace('#/\w+$#', '', $pageUrl);
        if (isset($seenGalleryUrls[$galleryUrl])) {
            continue;
        }

        // Only hand over TakenOn when it really is a string
        $takenOn = $photo->TakenOn ?? null;
        $metadata = getPhotoGalleryMetadataByUrl(
            $galleryUrl,
            is_string($takenOn) ? $takenOn : null
        );

        $seenGalleryUrls[$galleryUrl] = true;

        yield $metadata;
    }
}
// Map years to photo gallery ids
// (the ids need 64-bit integers to be represented exactly)
$photoGalleryIds = [
    2016 => 3742586428944470221,
    2017 => 3742586429170821662,
    2018 => 3742586428889977272,
];

// Read the arguments defensively so a missing option or year falls through
// to the usage text instead of raising "undefined offset" warnings
$option = $argv[1] ?? null;
$year = $argv[2] ?? null;

// Check arguments
if ($option === '--year' && $year !== null && isset($photoGalleryIds[$year])) {
    // Loop through all galleries
    foreach (getPhotoSetUrlsByPhotoSetId($photoGalleryIds[$year]) as $photoGallery) {
        echo $photoGallery->group.' - '.$photoGallery->date.' - '.$photoGallery->url.PHP_EOL;
    }

    return;
}

// Print help
echo 'Usage: php '.($argv[0] ?? basename(__FILE__)).' --year [year]'.PHP_EOL;
echo 'Valid years: '.implode(', ', array_keys($photoGalleryIds)).PHP_EOL;