
<?php
// Downloads MIT OpenCourseWare PDF file(s) from https://ocw.mit.edu
//
// How to execute the current script?
// Make sure your internet connection is on.
// Step 1: Install XAMPP server and start Apache and Mysql tasks from the
//         control panel
// Step 2: Go to htdocs folder and create a folder PDF.
// Step 3: Copy simple_html_dom.php to htdocs folder
// Step 4: Copy this file (download_all_pdf_files.php) to htdocs folder.
// Step 5: Open browser and type "http://localhost/download_all_pdf_files.php"
// Step 6: All the pdf files will be saved in PDF folder under htdocs

// PHP DOM html parser.
// Please download the complete project from here:
// https://sourceforge.net/projects/simplehtmldom/
// The file below must be taken from that project and must be present where
// the script is executed (the htdocs folder while running this script).
// require_once is used because nothing below can work without the parser.
require_once "simple_html_dom.php";

/**
 * Copies the content readable at $url into the local file $path.
 *
 * @param string $url  Source to read; opened with fopen() in binary mode.
 * @param string $path Destination file; created, or truncated if it exists.
 *
 * @return bool true when the copy completed, false when either stream could
 *              not be opened or the copy itself failed.
 */
function download_file_from_url($url, $path)
{
    $source = fopen($url, "rb");
    if ($source === false) {
        return false;
    }

    $target = fopen($path, "wb");
    if ($target === false) {
        // Do not leak the already opened source stream.
        fclose($source);
        return false;
    }

    // Let PHP stream the data instead of a hand-written fread/fwrite loop.
    $copied = stream_copy_to_stream($source, $target);

    fclose($source);
    fclose($target);

    return $copied !== false;
}

/**
 * Collects the href of every <a> element on a page whose href contains ".pdf".
 *
 * @param string $website_url Page to scan.
 *
 * @return array List of href values exactly as they appear in the page; empty
 *               when the page could not be loaded or has no matching link.
 */
function get_all_pdf_links($website_url)
{
    $html = file_get_html($website_url);
    if (!$html) {
        // file_get_html() gave nothing usable; calling find() on it would be
        // a fatal error.
        return array();
    }

    $pdf_links_list = array();
    foreach ($html->find('a') as $element) {
        $href = $element->href;
        // Anchors without an href do not yield a string; skip them.
        if (is_string($href) && strpos($href, '.pdf') !== false) {
            $pdf_links_list[] = $href;
        }
    }

    return $pdf_links_list;
}

// Change this URL to your target web page as per your requirement
$target_url = 'https://ocw.mit.edu/courses/mathematics/18-440-probability-and-random-variables-spring-2014/lecture-notes/';

// Change here base URL to your target as per your requirement.
// It is prepended to every link that is not already an absolute http(s) URL.
$base_url = "https://ocw.mit.edu";

// Folder (relative to the working directory) that receives the PDF files.
$pdf_dir = "PDF";
if (!is_dir($pdf_dir) && !mkdir($pdf_dir, 0777, true)) {
    echo "<br>Cannot create folder ".htmlspecialchars($pdf_dir, ENT_QUOTES, 'UTF-8');
    exit;
}

$pdf_links_array = get_all_pdf_links($target_url);

// download all pdf file(s)
foreach ($pdf_links_array as $pdf_link) {
    if (preg_match('#^https?://#i', $pdf_link)) {
        $complete_pdf_url = $pdf_link;
    } else {
        $complete_pdf_url = $base_url.$pdf_link;
    }

    // Name the local file after the last path segment, ignoring any query
    // string or fragment that follows it in the link.
    $url_path = parse_url($pdf_link, PHP_URL_PATH);
    $file_name = is_string($url_path) ? basename($url_path) : '';
    if ($file_name === '') {
        continue;
    }
    $pdf_name = $pdf_dir."/".$file_name;

    // The links come from a remote page, so escape them before echoing
    // them into the HTML response.
    echo "<br>Downloading from..."
        .htmlspecialchars($complete_pdf_url, ENT_QUOTES, 'UTF-8')
        ."...to..."
        .htmlspecialchars($pdf_name, ENT_QUOTES, 'UTF-8')
        ."...";

    if (!download_file_from_url($complete_pdf_url, $pdf_name)) {
        echo "FAILED";
    }
}

// Stop here so the literal text after the closing tag is never sent as output.
exit;
?>
There are many ready-made plugins and source code packages available for web scraping. The following is a list of ones you can purchase directly:
WordPress & WooCommerce Scraper Plugin, Import Data from Any Site
Just Dial Scraper and Extractor