cachefilebase: $cachefilebase"; #echo "
dirname: $dirname
"; #echo "
URL Path (var): $var
"; #echo "
cachefile: $cachefile
"; #echo "
path for cachefile: $cachefiledir 
";

/*
 * Article cache: serve the cached HTML for $url when present, otherwise
 * probe the remote page, download it, cache it and mirror its images.
 *
 * Reads  : $refresh, $cachefile, $cachefiledir, $url, $dirname, $title
 *          (all assigned before this point, outside this section).
 * Writes : $contents  - article HTML, or '' when nothing could be loaded
 *          $httpcode  - status of the existence probe (only on a cache miss)
 *          $notfound, $msg, $title and a 404 status line when the probe fails
 *          $amp       - path of the cached AMP copy, used by the code below
 *
 * NOTE(review): in the collapsed original a `###2024` / `### 2024` marker
 * sits directly in front of each file_put_contents() call. This rewrite
 * assumes the marker is a standalone comment and the writes are live;
 * confirm against the uncollapsed file.
 */

// Mobile Safari user agent sent with every outbound request.
$useragent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/70.0.3538.75 Mobile/15E148 Safari/605.1';

/*
 * Perform one HTTP request and return array($body, $status).
 * $body is false when the transfer failed; $status is 0 when no response
 * was received. $headonly selects the header-only existence probe.
 */
$fetchurl = function ($target, $headonly) use ($useragent) {
    $handle = curl_init($target);
    if ($handle === false) {
        return array(false, 0);
    }

    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($handle, CURLOPT_TIMEOUT, 2);
    curl_setopt($handle, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
    // Empty string lets cURL advertise every encoding it supports (gzip, deflate, ...).
    curl_setopt($handle, CURLOPT_ENCODING, '');
    curl_setopt($handle, CURLOPT_USERAGENT, $useragent);

    if ($headonly) {
        // Headers only. As in the original, the probe does NOT follow
        // redirects, so a 301/302 counts as "not found".
        curl_setopt($handle, CURLOPT_HEADER, true);
        curl_setopt($handle, CURLOPT_NOBODY, true);
    } else {
        curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
        // NOTE(review): TLS verification is disabled, as in the original.
        // This permits man-in-the-middle tampering of cached content.
        curl_setopt($handle, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($handle, CURLOPT_SSL_VERIFYHOST, false);
    }

    $body = curl_exec($handle);
    $status = (int) curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    return array($body, $status);
};

// The final echo of $msg must not hit an undefined variable on the success path.
if (!isset($msg)) {
    $msg = '';
}

$contents = '';
$havecache = false;

// A refresh request also discards the cached AMP copy.
if ($refresh == "yes" && file_exists("$cachefile.amp")) {
    unlink("$cachefile.amp");
}

// Use the cached page unless a refresh was requested or the file is blank.
if (file_exists($cachefile)) {
    $cached = ($refresh == "yes") ? false : file_get_contents($cachefile);
    // Loose comparison kept from the original: '' and '0' both count as blank.
    if ($cached !== false && trim($cached) != false) {
        $contents = $cached;
        $havecache = true;
    } else {
        unlink($cachefile);
    }
}

if (!$havecache) {
    // See if the page exists before downloading the body.
    list(, $httpcode) = $fetchurl("$url", true);

    if ($httpcode !== 200) {
        $notfound = "404 Not Found";
        $msg = "

$notfound

";
        $msg .= "Return Home ";
        $title = "$title $notfound - ";
        header($_SERVER["SERVER_PROTOCOL"]." 404 Not Found");
    } else {
        list($fetched) = $fetchurl($url, false);

        // Only cache a real response; a failed transfer used to be written
        // out as an empty cache file.
        if (is_string($fetched) && $fetched !== '') {
            $contents = $fetched;

            if (!file_exists($cachefiledir)) {
                mkdir($cachefiledir, 0777, true);
            }
            file_put_contents($cachefile, $contents);

            // Parse once; reused for the JSON-LD dates and the image list.
            // Internal libxml errors stay enabled for the loadHTML() calls
            // further down, as before.
            libxml_use_internal_errors(true);
            $page = new DOMDocument();
            $page->loadHTML($contents);

            // Stamp the cache file with the article's own dates when the
            // page carries them in its first JSON-LD block.
            $finder = new DOMXPath($page);
            $ldnodes = $finder->query('//script[@type="application/ld+json"]');
            $ld = null;
            if ($ldnodes !== false && $ldnodes->length > 0) {
                $ld = json_decode(trim($ldnodes->item(0)->nodeValue), true);
            }
            $stamps = array('dateModified' => false, 'datePublished' => false);
            foreach (array_keys($stamps) as $key) {
                if (is_array($ld) && isset($ld[$key]) && is_string($ld[$key])) {
                    $stamps[$key] = strtotime($ld[$key]);
                }
            }
            if ($stamps['dateModified'] !== false) {
                // mtime = modified, atime = published (falls back to modified).
                $atime = ($stamps['datePublished'] !== false)
                    ? $stamps['datePublished']
                    : $stamps['dateModified'];
                touch($cachefile, $stamps['dateModified'], $atime);
            }

            // Save images under "$dirname/images/<remote path>".
            $tags = $page->getElementsByTagName('img');
            foreach ($tags as $tag) {
                $imgurl = $tag->getAttribute('src');
                $path = parse_url($imgurl, PHP_URL_PATH);

                // Skip tags without a usable path, and paths that could
                // escape the image directory.
                if (!is_string($path) || $path === '' || strpos($path, '..') !== false) {
                    continue;
                }

                // Taking the name from the parsed path drops the query string.
                $filename = basename($path);
                if ($filename === '') {
                    continue;
                }

                $dirnameimage = "$dirname/images" . dirname($path);
                // The separator was missing before, so files were written
                // next to the directory created below instead of inside it.
                $filenamewithpath = rtrim($dirnameimage, '/') . '/' . $filename;

                if (file_exists($filenamewithpath)) {
                    continue;
                }

                list($image, $imagecode) = $fetchurl($imgurl, false);

                // Never store an error response as an image: the file would
                // exist afterwards and the real image would not be retried.
                if (empty($image) || $imagecode < 200 || $imagecode >= 300) {
                    continue;
                }

                if (!file_exists($dirnameimage)) {
                    mkdir($dirnameimage, 0777, true);
                }
                file_put_contents($filenamewithpath, $image);
            }
        }
    }
}

// Local processing: path of the cached AMP copy (checked for below).
$amp = "$cachefile.amp";
#echo "
amp is $amp
"; $dom = new DOMDocument(); @$dom->loadHTML($contents); $nodes = $dom->getElementsByTagName('link'); foreach ($nodes as $node) { if ($node->getAttribute('rel') === 'amphtml') { $amphtml = ($node->getAttribute('href')); /* amp page found do your logic */ #echo "amphtml found at $amphtml "; if (!file_exists($amp)) { #echo " local not found so getting amp $amp from $amphtml "; $curl = curl_init($amphtml); curl_setopt($curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4); curl_setopt($curl, CURLOPT_ENCODING, '');//set gzip, deflate or keep empty for server to detect and set supported encoding. curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE); curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE); curl_setopt($curl, CURLOPT_TIMEOUT,2); curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/70.0.3538.75 Mobile/15E148 Safari/605.1'); $contents = curl_exec($curl); $info = curl_getinfo($curl); #echo '
Image(s) Took ' . $info['total_time'] . ' seconds to send a request to ' . $info['url']; curl_close($curl); ### 2024 file_put_contents($amp, $contents); } } } if (file_exists($amp)) { #$contents = file_get_contents($amp); $doc = new DOMDocument(); #libxml_use_internal_errors(true); $doc->loadHTML($contents); $finder = new DomXPath($doc); $node = $finder->query("//*[contains(@class, 'entry__content')]"); $ampnewbodystory = ($doc->saveHTML($node->item(0))); } if (file_exists($amp)) { #$contents = file_get_contents($amp); } $contents = preg_replace('/srcSet=/m',"src2=", $contents); $contents = preg_replace('~href="/~', 'href="/newstempch.php?article=/', $contents); $contents = preg_replace('~href="http://www.huffpost.com/~', 'href="/newstempch.php?article=/', $contents); $contents = preg_replace('~href="https://www.huffpost.com/~', 'href="/newstempch.php?article=/', $contents); $contents = preg_replace('~href="/news/~', 'href="/newstempch.php?article=', $contents); $contents = str_replace('src=\'https://i.huffpost.com',"src='$cdn/huffarticles/images", $contents); $contents = str_replace('src=\'http://i.huffpost.com',"src='$cdn/huffarticles/images", $contents); $contents = str_replace('src="https://i.huffpost.com',"src=\"$cdn/huffarticles/images", $contents); $contents = preg_replace('~https://i.huffpost.com~',"$cdn/huffarticles/images", $contents); $contents = str_replace("| CBC News","", $contents); $contents = preg_replace('/[\x00-\x1F\x7F-\xFF]/', '', $contents); $contents = preg_replace('~https://www.huffpost.com~', 'https://action.news', $contents); $contents = str_replace('', ' ', $contents); $contents = str_replace("<a","<", $contents); $contents = htmlspecialchars_decode($contents); $contents = preg_replace('/
/Usi', "", $contents); $contents = preg_replace('/
loadHTML($contents); $xpath = new DomXPath($dom); $xpath_results = $xpath->query("//div[contains(@class, '$classname')]"); if($div = $xpath_results->item(0)){ //remove the node the same way $div ->parentNode->removeChild($div); $contents = $dom->saveHTML(); #echo $dom->saveHTML(); } $classname = "img-sized__placeholder"; $dom = new DOMDocument(); $dom->loadHTML($contents); $xpath = new DomXPath($dom); $xpath_results = $xpath->query("//div[contains(@class, '$classname')]"); if($div = $xpath_results->item(0)){ //remove the node the same way $div ->parentNode->removeChild($div); $contents = $dom->saveHTML(); #echo $dom->saveHTML(); } $classname = "connatix-wrapper"; $dom = new DOMDocument(); $dom->loadHTML($contents); $xpath = new DomXPath($dom); $xpath_results = $xpath->query("//div[contains(@class, '$classname')]"); if($div = $xpath_results->item(0)){ //remove the node the same way $div ->parentNode->removeChild($div); $contents = $dom->saveHTML(); #echo $dom->saveHTML(); } $doc = new DOMDocument(); @$doc->loadHTML($contents); $nodes = $doc->getElementsByTagName('title'); $title = $nodes->item(0)->nodeValue; $metas = $doc->getElementsByTagName('meta'); for ($i = 0; $i < $metas->length; $i++) { $meta = $metas->item($i); if($meta->getAttribute('name') == 'description') $description = $meta->getAttribute('content'); if($meta->getAttribute('name') == 'keywords') $keywords = $meta->getAttribute('content'); if($meta->getAttribute('property') == 'og:image') $metaimage = $meta->getAttribute('content'); } if (empty($title)) { //Get the article title preg_match_all('/(.*?)<\/titleh>/is', $contents, $matches); foreach($matches[1] as $title1){ $title = "$title1 - ";} } if (!file_exists($cachefile)){ $notfound = "404 Not Found"; $msg = "

$notfound

"; $msg .= "Return Home "; $title = "$title $notfound - "; header($_SERVER["SERVER_PROTOCOL"]." 404 Not Found"); } $doc = new DOMDocument(); libxml_use_internal_errors(true); $doc->loadHTML($contents); $finder = new DomXPath($doc); $node = $finder->query("//*[contains(@class, 'top-header js-cet-subunit')]"); $newbodyheader = ($doc->saveHTML($node->item(0))); $doc = new DOMDocument(); #libxml_use_internal_errors(true); $doc->loadHTML($contents); $finder = new DomXPath($doc); $node = $finder->query("//*[contains(@class, 'entry__content-list js-main-content-list')]"); #$node = $finder->query("//*[contains(@class, 'entry__header entry__header--no-top-media')]"); $newbodystory = ($doc->saveHTML($node->item(0))); $classname = "nav__content"; $dom = new DOMDocument(); $dom->loadHTML($contents); $xpath = new DomXPath($dom); $xpath_results = $xpath->query("//div[contains(@class, '$classname')]"); if($div = $xpath_results->item(0)){ //remove the node the same way $div ->parentNode->removeChild($div); $contents = $dom->saveHTML(); #echo $dom->saveHTML(); } if (empty($newstorybody)){ $finder = new DomXPath($doc); $node = $finder->query("//*[contains(@class, 'entry-head-container')]"); #$newbodystory = ($doc->saveHTML($node->item(0))); } $dom = new DOMDocument(); libxml_use_internal_errors( 1 ); $dom->loadHTML($contents); $xpath = new DOMXpath( $dom ); $jsonScripts = $xpath->query( '//script[@type="application/ld+json"]' ); $json = trim( $jsonScripts->item(0)->nodeValue ); $data = json_decode($json); $json = json_decode($json, true); $dateModified = $json['dateModified']; $datePublished = $json['datePublished']; $timestamp = strtotime('11/23/2011 10:59 am EST'); $datePublishedlinux = strtotime($datePublished); $dateModifiedlinux = strtotime($dateModified); touch($cachefile, $dateModifiedlinux, $datePublishedlinux); touch($amp, $dateModifiedlinux, $datePublishedlinux); include('header.php'); ?>
"; } #echo $newbodyheader; echo $newbodystory; #echo $ampnewbodystory; echo $msg; ?>