$end) { return 0; } $num_children = $this->locum_config[harvest_config][max_children]; $num_to_process = $end - $start; $increment = ceil($num_to_process / $num_children); if (extension_loaded('pcntl') && $this->locum_config[harvest_config][harvest_with_children] && ($num_to_process >= (2 * $num_children))) { for ($i = 0; $i < $num_children; ++$i) { $end = $start + ($increment - 1); $new_start = $end + 1; $pid = pcntl_fork(); if ($pid != -1) { if ($pid) { parent::putlog("Spawning child harvester to scan records $start - $end. PID is $pid .."); } else { sleep(1); ++$i; if ($i == $num_children) { $end++; } $result = self::import_bibs($start, $end); parent::putlog("Child process complete. Scanned records $start - $end. Imported $result[imported] reords and skipped $result[skipped] ..", 2); exit($i); } } else { parent::putlog("Unable to spawn harvester: ($i)", 5); } $start = $new_start; } if ($pid) { while ($i > 0) { pcntl_waitpid(-1, &$status); $val = pcntl_wexitstatus($status); --$i; } parent::putlog("Harvest complete!", 3); } } else { $result = self::import_bibs($start, $end); } } /** * Does the actual import of bib records. Called by the harvester. * It uses start and end parameters because this function can potentially be called by a * child process. 
*
   * @param int $start Bib number to start with
   * @param int $end Bib number to end with (inclusive)
   * @return array Array of information about the bibs imported: 'skipped' and 'imported' counts
   */
  public function import_bibs($start, $end) {
    // Only these keys of a scraped bib are bound to the INSERT; anything else is ignored.
    $valid_vals = array(
      'bnum', 'author', 'addl_author', 'title', 'title_medium', 'edition', 'series',
      'callnum', 'pub_info', 'pub_year', 'stdnum', 'lccn', 'descr', 'notes',
      'subjects_ser', 'lang', 'loc_code', 'mat_code', 'cover_img', 'bib_created',
      'bib_lastupdate', 'bib_prevupdate', 'bib_revs',
    );

    $db = MDB2::connect($this->dsn);
    $process_report = array('skipped' => 0, 'imported' => 0);

    for ($i = $start; $i <= $end; $i++) {
      $bib = $this->locum_cntl->scrape_bib($i);
      if ($bib == FALSE) {
        // Any falsy scrape result is counted as a skipped bib number.
        $process_report['skipped']++;
        continue;
      }

      // The last element of the scraped record is used as the subject list.
      // NOTE(review): relies on scrape_bib() always putting subjects last - confirm.
      $subj = array_pop($bib);

      // Start clean for every record so values can never leak over from the previous bib.
      $bib_values = array();
      foreach ($bib as $bkey => $bval) {
        if (in_array($bkey, $valid_vals, TRUE)) {
          $bib_values[$bkey] = $bval;
        }
      }
      $bib_values['subjects_ser'] = serialize($subj);

      $sql_prep = $db->prepare('INSERT INTO locum_bib_items VALUES (:bnum, :author, :addl_author, :title, :title_medium, :edition, :series, :callnum, :pub_info, :pub_year, :stdnum, :lccn, :descr, :notes, :subjects_ser, :lang, :loc_code, :mat_code, :cover_img, NOW(), :bib_created, :bib_lastupdate, :bib_prevupdate, :bib_revs, \'1\')');
      $sql_prep->execute($bib_values);
      $sql_prep->free();

      if (is_array($subj) && count($subj)) {
        // One row per subject heading; the statement is prepared once per bib and reused.
        $sql_prep = $db->prepare('INSERT INTO locum_bib_items_subject VALUES (?, ?)', array('integer', 'text'), MDB2_PREPARE_MANIP);
        foreach ($subj as $subj_heading) {
          $sql_prep->execute(array($bib['bnum'], $subj_heading));
        }
        $sql_prep->free();
      }

      $process_report['imported']++;
    }

    $db->disconnect();
    return $process_report;
  }

  /**
   * Scans existing imported bibs for changes or weeds and makes the appropriate changes.
*
   * @param boolean $quiet Run this function silently. Default: TRUE
   *                       (currently not read anywhere in this method)
   */
  public function verify_bibs($quiet = TRUE) {
    parent::putlog("Collecting current data keys ..");
    $db = MDB2::connect($this->dsn);
    $init_result = $db->query('SELECT bnum, bib_lastupdate FROM locum_facet_heap');
    $init_bib_arr = $init_result->fetchAll(MDB2_FETCHMODE_ASSOC);
    $db->disconnect();

    // Map of bnum => last known update date, handed to update_bib() in slices.
    $bib_arr = array();
    foreach ($init_bib_arr as $init_bib_arr_vals) {
      $bib_arr[$init_bib_arr_vals['bnum']] = $init_bib_arr_vals['bib_lastupdate'];
    }
    parent::putlog("Finished collecting data keys.");

    $harvest_cfg = $this->locum_config['harvest_config'];
    $num_children = $harvest_cfg['max_children'];
    $num_to_process = count($init_bib_arr);

    $use_children = extension_loaded('pcntl')
      && $harvest_cfg['harvest_with_children']
      && $num_children > 0 // guards the division below
      && ($num_to_process >= (2 * $num_children));

    if ($use_children) {
      // Size of the slice each child verifies; the last slice may be shorter.
      $increment = (int) ceil($num_to_process / $num_children);
      $spawned = 0;

      for ($i = 0; $i < $num_children; ++$i) {
        $split_offset = $i * $increment;
        $pid = pcntl_fork();

        if ($pid == -1) {
          parent::putlog("Unable to spawn harvester: ($i)", 5);
          continue;
        }

        if ($pid) {
          // Parent: remember that there is one more child to wait for.
          parent::putlog("Spawning child harvester to verify records. PID is $pid ..");
          ++$spawned;
          continue;
        }

        // Child: verify its own slice, then exit so it never continues the fork loop.
        sleep(1);
        $bib_arr_sliced = array_slice($bib_arr, $split_offset, $increment, TRUE);
        $num_bibs = count($bib_arr_sliced);
        $tmp = self::update_bib($bib_arr_sliced);
        $updated = $tmp['updated'];
        $retired = $tmp['retired'];
        parent::putlog("Child process complete. Checked $num_bibs records, updated $updated records, retired $retired records.", 2);
        // Exit status is the 1-based index of this child.
        exit($i + 1);
      }

      // Reap every child that was actually started. $status is filled in by reference
      // (declared by pcntl_waitpid itself; call-time "&$status" is a fatal error in PHP >= 5.4).
      while ($spawned > 0) {
        pcntl_waitpid(-1, $status);
        --$spawned;
      }
      parent::putlog("Verification complete!", 3);
    } else {
      // TODO - Bib verification for those poor saps w/o pcntl
    }
  }

  /**
   * Does the actual update of the bib record if something has changed.
* This function is called by verify_bibs()
   *
   * For every bnum the record is re-scraped. A falsy scrape result retires the bib
   * (active = '0', subject rows deleted); a record whose bib_lastupdate differs from the
   * stored date is rewritten together with its subject rows.
   *
   * @param array $bib_arr Array of bibs like: key => val is bnum => last update date
   * @return array Array of # updated and # retired (keys 'retired' and 'updated')
   */
  public function update_bib($bib_arr) {
    // Keys of a scraped bib that are bound to the UPDATE statement.
    $valid_vals = array(
      'bib_created', 'bib_lastupdate', 'bib_prevupdate', 'bib_revs', 'lang', 'loc_code',
      'mat_code', 'author', 'addl_author', 'title', 'title_medium', 'edition', 'series',
      'callnum', 'pub_info', 'pub_year', 'stdnum', 'lccn', 'descr', 'notes', 'bnum',
    );
    // One type per placeholder of the UPDATE statement, in placeholder order.
    $update_types = array(
      'date', 'date', 'date', 'integer', 'text', 'text', 'text', 'text', 'text', 'text', 'text',
      'text', 'text', 'text', 'text', 'integer', 'text', 'text', 'text', 'text', 'text', 'integer',
    );
    $setlist = implode(', ', array(
      'bib_created = :bib_created',
      'bib_lastupdate = :bib_lastupdate',
      'bib_prevupdate = :bib_prevupdate',
      'bib_revs = :bib_revs',
      'lang = :lang',
      'loc_code = :loc_code',
      'mat_code = :mat_code',
      'author = :author',
      'addl_author = :addl_author',
      'title = :title',
      'title_medium = :title_medium',
      'edition = :edition',
      'series = :series',
      'callnum = :callnum',
      'pub_info = :pub_info',
      'pub_year = :pub_year',
      'stdnum = :stdnum',
      'lccn = :lccn',
      'descr = :descr',
      'notes = :notes',
      'subjects = :subjects_ser',
      'modified = NOW()',
    ));

    $db = MDB2::connect($this->dsn);
    $updated = 0;
    $retired = 0;

    foreach ($bib_arr as $bnum => $init_bib_date) {
      $bib = $this->locum_cntl->scrape_bib($bnum, TRUE);

      if ($bib == FALSE) {
        // TODO add a verification of weed in here somehow
        $sql_prep = $db->prepare('UPDATE locum_bib_items SET active = ? WHERE bnum = ?', array('text', 'integer'));
        $sql_prep->execute(array('0', $bnum));
        $sql_prep->free(); // release before the handle is reused below

        $sql_prep = $db->prepare('DELETE FROM locum_bib_items_subject WHERE bnum = ?', array('integer'));
        $sql_prep->execute(array($bnum));
        $sql_prep->free();

        $retired++;
        continue;
      }

      if (!($bib['bnum'] && $bib['bib_lastupdate'] != $init_bib_date)) {
        // Unchanged since the last harvest (or no usable bnum): nothing to do.
        continue;
      }

      // The last element of the scraped record is used as the subject list.
      // NOTE(review): relies on scrape_bib() always putting subjects last - confirm.
      $subj = array_pop($bib);

      // Start clean for every record so values can never leak over from the previous bib.
      $bib_values = array();
      foreach ($bib as $bkey => $bval) {
        if (in_array($bkey, $valid_vals, TRUE)) {
          $bib_values[$bkey] = $bval;
        }
      }
      $bib_values['subjects_ser'] = serialize($subj);

      $sql_prep = $db->prepare('UPDATE locum_bib_items SET ' . $setlist . ' WHERE bnum = :bnum', $update_types, MDB2_PREPARE_MANIP);
      $sql_prep->execute($bib_values);
      $sql_prep->free(); // release before the handle is reused below

      // Subject rows are replaced wholesale: delete the old ones, then insert the new ones.
      $sql_prep = $db->prepare('DELETE FROM locum_bib_items_subject WHERE bnum = ?', array('integer'));
      $sql_prep->execute(array($bnum));
      $sql_prep->free();

      if (is_array($subj) && count($subj)) {
        $sql_prep = $db->prepare('INSERT INTO locum_bib_items_subject VALUES (?, ?)', array('integer', 'text'), MDB2_PREPARE_MANIP);
        foreach ($subj as $subj_heading) {
          $sql_prep->execute(array($bnum, $subj_heading));
        }
        $sql_prep->free();
      }

      parent::putlog("Updated record # $bnum", 2, TRUE);
      $updated++;
    }

    $db->disconnect();
    return array('retired' => $retired, 'updated' => $updated);
  }

  /**
   * Scans for newly cataloged bib records.
   * Uses the ini "harvest_reach" param to determine how far forward to seek:
   * the range handed to harvest_bibs() starts one past the highest bnum already
   * stored in locum_bib_items.
   */
  public function new_bib_scan() {
    $db = MDB2::connect($this->dsn);
    $max_bib_result = $db->query('SELECT MAX(bnum) FROM locum_bib_items');
    $max_bib = $max_bib_result->fetchOne();
    $db->disconnect();

    $next_bib = $max_bib + 1;
    $last_bib = $next_bib + $this->locum_config['harvest_config']['harvest_reach'];
    self::harvest_bibs($next_bib, $last_bib);
  }

  /**
   * Flushes the holds_count table and rebuilds it. Useful for keeping popularity information
   * up-to-date. It's needed in this format so that the sphinx index can be rebuilt with
   * sortable popularity data.
*/
  public function rebuild_holds_cache() {
    // Count type (suffix of the hold_count_* column) => query producing bnum/total pairs.
    $count_queries = array(
      'week' => 'SELECT bnum, COUNT(bnum) AS total FROM locum_holds_placed WHERE hold_date >= DATE_SUB(CURDATE(), INTERVAL 1 WEEK) GROUP BY bnum',
      'month' => 'SELECT bnum, COUNT(bnum) AS total FROM locum_holds_placed WHERE hold_date >= DATE_SUB(CURDATE(), INTERVAL 1 MONTH) GROUP BY bnum',
      'year' => 'SELECT bnum, COUNT(bnum) AS total FROM locum_holds_placed WHERE hold_date >= DATE_SUB(CURDATE(), INTERVAL 1 YEAR) GROUP BY bnum',
      'total' => 'SELECT bnum, COUNT(bnum) AS total FROM locum_holds_placed GROUP BY bnum',
    );

    $db = MDB2::connect($this->dsn);

    // Start from an empty table holding one row per known bib.
    $db->exec('DELETE FROM locum_holds_count');
    $db->exec('INSERT INTO locum_holds_count (bnum) SELECT locum_bib_items.bnum FROM locum_bib_items');

    foreach ($count_queries as $count_type => $count_sql) {
      $dbq = $db->query($count_sql);
      $result_arr = $dbq->fetchAll(MDB2_FETCHMODE_ASSOC);

      // The column name comes from the fixed list above; the values are bound, not concatenated.
      $sql_prep = $db->prepare('UPDATE locum_holds_count SET hold_count_' . $count_type . ' = ? WHERE bnum = ?', array('integer', 'integer'), MDB2_PREPARE_MANIP);
      foreach ($result_arr as $result) {
        $sql_prep->execute(array($result['total'], $result['bnum']));
      }
      $sql_prep->free();
    }

    $db->disconnect();
  }

  /**
   * Empties locum_facet_heap and refills it with the facet columns (bnum, series, mat_code,
   * loc_code, lang, pub_year, bib_lastupdate) of every active row in locum_bib_items.
   */
  public function rebuild_facet_heap() {
    $db = MDB2::connect($this->dsn);
    $db->exec('DELETE FROM locum_facet_heap');
    $db->exec('INSERT INTO locum_facet_heap SELECT bnum, series, mat_code, loc_code, lang, pub_year, bib_lastupdate FROM locum_bib_items WHERE active = \'1\'');
    $db->disconnect();
  }

  /**
   * Grabs the cover image URL for caching (much faster on the front-end to do it this way).
   * Will try amazon if the ini says so.
*
   * Which sources are tried, and in which order, is read from the api_config section:
   * use_amazon_images / use_syndetic_images enable a source, and when both are enabled
   * Amazon goes first if amazon_img_prio >= syndetic_img_prio, Syndetics otherwise.
   * The second source is only asked when the first one returns nothing.
   *
   * @param string $stdnum_raw - stdnum/ISBN from the bib record
   * @return string Image URL, or an empty string when no source is enabled or none has an image
   */
  public function get_cover_img($stdnum_raw) {
    // Format stdnum as best we can: keep only the first whitespace-delimited token.
    // Trimming first means a leading space no longer produces an empty stdnum.
    $stdnum_parts = preg_split('/\s+/', trim($stdnum_raw));
    $stdnum = $stdnum_parts[0];
    if ($stdnum === '') {
      // Nothing to look up; skip the remote calls.
      return '';
    }

    $api_cfg = $this->locum_config['api_config'];
    $use_amazon = !empty($api_cfg['use_amazon_images']);
    $use_syndetic = !empty($api_cfg['use_syndetic_images']);

    // Build the list of enabled sources in the order they should be tried.
    $sources = array();
    if ($use_amazon) {
      $sources[] = 'amazon';
    }
    if ($use_syndetic) {
      $sources[] = 'syndetic';
    }
    if ($use_amazon && $use_syndetic && $api_cfg['amazon_img_prio'] < $api_cfg['syndetic_img_prio']) {
      $sources = array_reverse($sources);
    }

    foreach ($sources as $source) {
      if ($source == 'amazon') {
        $image_url = self::get_amazon_image($stdnum, $api_cfg['amazon_access_key']);
      } else {
        $image_url = self::get_syndetic_image($stdnum, $api_cfg['syndetic_custid']);
      }
      if ($image_url) {
        return $image_url;
      }
    }

    return '';
  }

  /**
   * Used by get_cover_img to get the Amazon cover image URL.
   * You'll need to put in your own Amazon API key into the ini.
   *
   * @param string $stdnum Stdnum/ISBN
   * @param string $api_key Amazon API key - they're free. Go git one.
* @return string Cover image URL, or an empty string when the lookup fails or has no medium image
   */
  public function get_amazon_image($stdnum, $api_key) {
    $url = 'http://webservices.amazon.com/onca/xml?Service=AWSECommerceService'
      . '&AWSAccessKeyId=' . urlencode($api_key)
      . '&Operation=ItemLookup&IdType=ASIN&ItemId=' . urlencode($stdnum)
      . '&ResponseGroup=Medium,OfferFull';

    $az_dl = self::fetch_cover_api_response($url);
    if ($az_dl === FALSE) {
      return '';
    }

    $az = simplexml_load_string($az_dl);
    if ($az === FALSE || !isset($az->Items->Item->MediumImage->URL)) {
      return '';
    }

    return trim((string) $az->Items->Item->MediumImage->URL);
  }

  /**
   * Used by get_cover_img to get the Syndetics cover image URL.
   * You'll need to put in your own customer ID into the ini.
   *
   * @param string $stdnum Stdnum/ISBN
   * @param string $cust_id Your syndetics ID - it's overpriced. Go git one.
   * @return string Cover image URL, or an empty string when Syndetics has no usable cover
   */
  public function get_syndetic_image($stdnum, $cust_id) {
    $isbn = urlencode($stdnum);
    $client = urlencode($cust_id);

    $url = 'http://syndetics.com/index.aspx?isbn=' . $isbn . '/index.xml&client=' . $client . '&type=xw10';
    $syn_dl = self::fetch_cover_api_response($url);
    if ($syn_dl === FALSE || !preg_match('/xml/', $syn_dl)) {
      return '';
    }

    $syn = simplexml_load_string($syn_dl);
    if ($syn === FALSE || (string) $syn->SC !== 'SC.GIF') {
      return '';
    }

    $image_url = 'http://syndetics.com/hw7.pl?isbn=' . $isbn . '/SC.GIF&client=' . $client;

    // An image that is 1 pixel wide is treated as "no cover". If the size cannot be
    // determined at all, the URL is kept.
    $img_size = @getimagesize($image_url);
    if (is_array($img_size) && $img_size[0] == 1) {
      return '';
    }

    return $image_url;
  }

  /**
   * Downloads a URL for the cover image lookups.
   *
   * @param string $url URL to fetch
   * @return string|boolean Response body, or FALSE when the request failed or the first
   *                        response status line does not report 200
   */
  private static function fetch_cover_api_response($url) {
    // Warnings are suppressed on purpose: a failed lookup just means "no image".
    $body = @file_get_contents($url);

    // $http_response_header is filled in by the HTTP wrapper in this scope; it stays
    // unset when no response was received at all.
    if ($body === FALSE || empty($http_response_header[0])) {
      return FALSE;
    }

    // Status line looks like "HTTP/1.1 200 OK".
    $status_parts = explode(' ', $http_response_header[0], 3);
    if (!isset($status_parts[1]) || $status_parts[1] != '200') {
      return FALSE;
    }

    return $body;
  }
}