<?php

// Link checker module: checks for 404 (broken) links created by the link module (http://drupal.org/project/link).
// A large majority of this code was lovingly borrowed from the janode project at http://drupal.org/project/janode
// THANK YOU janode!

// Internal pseudo status codes, stored next to real HTTP status codes.
// Anything above _LINK_CHECKER_FAILURE_PIVOT is treated as a failed check by
// _link_checker_check_status(); values at or below it are non-fatal.

// The link does not use the http scheme, so it is not checked.
define("_LINK_CHECKER_NOT_HTTP", 251);
// Not referenced in the visible part of this file.
define("_LINK_CHECKER_DATA", 252);
// stream_get_meta_data() is unavailable, so the status cannot be read. The
// (misspelled) name is the one _link_checker_check_status() refers to; it was
// previously used without ever being defined.
define("_LINK_CHECKER_NO_MATA_DATA", 253);
// fopen() on the URL failed; deliberately above the failure pivot.
define("_LINK_CHECKER_FILE_OPEN_FAILURE", 353);
// Highest status value that still counts as "ok".
define("_LINK_CHECKER_FAILURE_PIVOT", 299);

/**
 * Implementation of hook_menu().
 *
 * Registers the module's settings page at admin/settings/linkchecker. The
 * page is rendered by drupal_get_form() using the
 * link_checker_admin_settings() form builder and is only accessible to users
 * holding the 'administer link checker' permission.
 *
 * @param $may_cache
 *   TRUE when cacheable menu items are being collected.
 * @return
 *   Array of menu items.
 */
function link_checker_menu($may_cache) {
  $items = array();

  // The settings page lives at a fixed path that does not depend on the
  // current request, so it is registered in the cacheable pass instead of
  // being rebuilt on every page load.
  if ($may_cache) {
    $items[] = array(
      'path' => 'admin/settings/linkchecker',
      'callback' => 'drupal_get_form',
      'callback arguments' => array('link_checker_admin_settings'),
      'title' => t('Link checker'),
      'description' => t('Configure link checker'),
      'access' => user_access('administer link checker'),
      'type' => MENU_NORMAL_ITEM,
    );
  }

  return $items;
}

/**
 * Implementation of hook_perm().
 *
 * @return
 *   Array with the single permission this module defines; it guards the
 *   settings page registered in link_checker_menu().
 */
function link_checker_perm() {
  return array('administer link checker');
}

/**
 * Administrator settings
 *
 * Builds the settings form: how many links to check per link field on each
 * cron run, after how many failed checks a node is unpublished, and which
 * node types (those owning at least one field of type 'link') are checked.
 *
 * @return Array for FAPI
 */
function link_checker_admin_settings() {
  // Check to see if allow_url_fopen is available; if not, raise an error
  // message so that the administrator knows the module cannot work without it.
  if (!ini_get('allow_url_fopen')) {
    drupal_set_message(t('PHP allow_url_fopen is not enabled in order for this module to work it must be enabled! '), 'error');
  }

  // Create the settings form.
  $form = array();
  $form['link_checker_batch_quantity'] = array(
    '#type' => 'select',
    '#title' => t('Max links to check per link field'),
    // The translatable string is kept exactly as found (line breaks included).
    '#description' => t('Determines the maximum number of links that will be checked for each link field per cron run.
A high number will cause cron to run slowly whereas a low number will require cron to be ran more often.
'),
    '#options' => array(1 => 1, 2 => 2, 5 => 5, 10 => 10),
    // Same fallback that link_checker_cron() uses, so the form shows the value
    // that is really in effect before the settings are saved for the first time.
    '#default_value' => variable_get('link_checker_batch_quantity', 10),
  );
  $form['link_checker_unpublish'] = array(
    '#type' => 'textfield',
    '#title' => t('Unpublish threshold'),
    '#description' => t('Number of consecutive CRON runs that detect an error to occur before unpublishing the related node, note, not all links are checked at every cron run due to the above setting. Enter 0 for never unpublish'),
    '#default_value' => variable_get('link_checker_unpublish', 0),
  );

  // Collect the node types that have at least one link field.
  $types = array();
  foreach (content_types() as $node_type) {
    if (empty($node_type['fields'])) {
      continue;
    }
    foreach ($node_type['fields'] as $field) {
      if ($field['type'] == 'link') {
        // Found a node type with a link field; one match is enough.
        $types[$node_type['type']] = $node_type['name'];
        break;
      }
    }
  }

  $form['link_checker_node_types'] = array(
    '#type' => 'select',
    '#title' => t('Select the node types to check'),
    '#description' => t('Unselected items will not be checked for 404 errors'),
    '#options' => $types,
    '#default_value' => variable_get('link_checker_node_types', array()),
    '#multiple' => TRUE,
  );

  return system_settings_form($form);
}

/**
 * Implementation of hook_cron().
 *
 * For every node type selected in the settings, looks up its link fields and
 * checks up to 'link_checker_batch_quantity' links per field, ordered by the
 * time they were last checked.
 */
function link_checker_cron() {
  // Only search the node types the user wants to search through.
  $node_types = (array) variable_get('link_checker_node_types', array());

  // One shared default for every field kind. The single-value branch used to
  // fall back to NULL, which produced "LIMIT 0" and checked nothing until the
  // settings form had been saved.
  $max = (int) variable_get('link_checker_batch_quantity', 10);
  if ($max < 1) {
    return;
  }

  foreach ($node_types as $node) {
    // NOTE(review): content_fields(NULL, $type) may return every field rather
    // than only the fields of this type - confirm against the CCK API. That
    // would explain the same link being visited more than once per run.
    $fields = content_fields(NULL, $node);
    if (!is_array($fields)) {
      continue;
    }
    foreach ($fields as $field) {
      if ($field['type'] == 'link') {
        _link_checker_cron_check_field($field, $max);
      }
    }
  }
}

/**
 * Checks a batch of links stored in one link field.
 *
 * @param $field
 *   CCK field definition of type 'link'.
 * @param $max
 *   Maximum number of links to check in this run.
 */
function _link_checker_cron_check_field($field, $max) {
  $db_info = content_database_info($field);
  $table_name = $db_info['table'];
  $field_name = $db_info['columns']['url']['column'];

  // Multiple-value fields carry a delta column; for single-value fields
  // _link_checker_check_status() stores delta 0, so that is what we join on.
  $delta = ($field['multiple'] == 1) ? 'c.delta' : '0';

  // The table name is concatenated rather than written as "{$table_name}":
  // inside a double-quoted string PHP consumes those braces, and Drupal needs
  // them to apply the database table prefix.
  // LEFT JOIN keeps links that have no {link_checker} row yet.
  // NOTE(review): where never-checked rows (NULL last_checked) sort depends on
  // the database engine - confirm for the engines this site supports.
  $sql = "SELECT c.nid, c.vid, " . $delta . " AS delta, c." . $field_name . " AS link"
    . " FROM {" . $table_name . "} c"
    . " LEFT JOIN {link_checker} lc ON c.nid = lc.nid AND c.vid = lc.vid"
    . " AND lc.delta = " . $delta . " AND lc.field_name = '%s'"
    . " WHERE c." . $field_name . " <> ''"
    . " ORDER BY lc.last_checked ASC";

  $links = db_query_range($sql, $field_name, 0, $max);
  while ($link = db_fetch_object($links)) {
    // Hand over to _link_checker_check_status() to update the status in the DB.
    $link->field_name = $field_name;
    _link_checker_check_status($link);
  }
}

/**
 * Checks one link and records the outcome in {link_checker}.
 *
 * Fetches the URL, stores the resulting status together with the current time
 * and, when the status is above _LINK_CHECKER_FAILURE_PIVOT and an unpublish
 * threshold is configured, increments the link's error count. Once the error
 * count reaches the threshold the node is unpublished and put into the
 * moderation queue. Any other outcome resets the error count.
 *
 * Code originally written in the janode project at
 * http://drupal.org/project/janode, thanks!
 *
 * @param $link
 *   Object with the properties nid, vid, link (the URL) and field_name, plus
 *   delta for multiple-value fields (treated as 0 when absent).
 */
function _link_checker_check_status($link) {
  static $message_once = TRUE;
  static $link_threshold_check = array();

  // allow_url_fopen is needed to fetch the URL. Log the problem only once per
  // request, but skip the check every time (the return used to sit inside the
  // log-once guard, so only the very first call was skipped).
  if (!ini_get('allow_url_fopen')) {
    if ($message_once) {
      watchdog('cron', 'PHP INI "allow_url_fopen" is false', WATCHDOG_NOTICE);
      $message_once = FALSE;
    }
    return;
  }

  $nid = (int) $link->nid;
  $vid = (int) $link->vid;
  $delta = isset($link->delta) ? (int) $link->delta : 0;
  $field_name = $link->field_name;

  // Always an integer from here on: the value can originate from a remote
  // server's status line and ends up in SQL below.
  $status = (int) _link_checker_fetch_status($link->link);
  $now = time();

  // Tell the db what we have discovered: update the existing row, or add one.
  $exists = db_query_range("SELECT lc.nid FROM {link_checker} lc WHERE lc.nid = %d AND lc.vid = %d AND lc.delta = %d AND lc.field_name = '%s'", $nid, $vid, $delta, $field_name, 0, 1);
  if (db_num_rows($exists) > 0) {
    db_query("UPDATE {link_checker} SET status = '%s', last_checked = %d WHERE nid = %d AND vid = %d AND delta = %d AND field_name = '%s'", $status, $now, $nid, $vid, $delta, $field_name);
  }
  else {
    db_query("INSERT INTO {link_checker} (nid, vid, delta, last_checked, status, field_name) VALUES (%d, %d, %d, %d, '%s', '%s')", $nid, $vid, $delta, $now, $status, $field_name);
  }

  // The same link can be handed to this function more than once per request,
  // so the error counter is only touched once per link. The key includes the
  // field name so two link fields on the same node revision do not collide.
  $check_key = $vid . '-' . $delta . '-' . $field_name;
  if (!isset($link_threshold_check[$check_key])) {
    $threshold = (int) variable_get('link_checker_unpublish', 0);
    if ($status > _LINK_CHECKER_FAILURE_PIVOT && $threshold > 0) {
      // Failed check and unpublishing is enabled: count the failure on this
      // link's own row only.
      db_query("UPDATE {link_checker} SET error_count = error_count + 1 WHERE nid = %d AND vid = %d AND delta = %d AND field_name = '%s'", $nid, $vid, $delta, $field_name);
      $result = db_query("SELECT error_count FROM {link_checker} WHERE nid = %d AND vid = %d AND delta = %d AND field_name = '%s'", $nid, $vid, $delta, $field_name);
      $stat = db_fetch_array($result);
      if ($stat && $stat['error_count'] >= $threshold) {
        // Unpublish and force the node into the moderation queue.
        // @todo you could work in something from actions module here instead?
        db_query("UPDATE {node} SET status = 0, moderate = 1 WHERE nid = %d", $nid);
      }
    }
    else {
      // Either the check passed or unpublishing is disabled: reset the count.
      db_query("UPDATE {link_checker} SET error_count = 0 WHERE nid = %d AND vid = %d AND delta = %d AND field_name = '%s'", $nid, $vid, $delta, $field_name);
    }
  }
  $link_threshold_check[$check_key] = TRUE;
}

/**
 * Fetches a URL and returns its HTTP status code.
 *
 * There are a number of HTTP status return codes; below 300 usually means all
 * went ok. The 250 series is used for this module's own non-fatal conditions.
 * Status codes above 299 are considered errors by the caller.
 *
 * @param $url
 *   The URL to check. Only the http scheme is supported.
 * @return
 *   The status code taken from the first HTTP status line of the response,
 *   0 when no status line was found, or one of the _LINK_CHECKER_* codes.
 */
function _link_checker_fetch_status($url) {
  $url_parts = @parse_url($url);
  if (!is_array($url_parts)) {
    $url_parts = array();
  }

  // Currently only http is supported.
  $scheme = isset($url_parts['scheme']) ? $url_parts['scheme'] : '';
  if ($scheme != 'http') {
    return _LINK_CHECKER_NOT_HTTP;
  }

  // Rebuild the URL from scheme, host, optional port, path and query.
  $host = isset($url_parts['host']) ? $url_parts['host'] : '';
  if (isset($url_parts['port']) && strlen($url_parts['port']) > 0) {
    $host .= ':' . $url_parts['port'];
  }
  $path = isset($url_parts['path']) ? $url_parts['path'] : '';
  $full_url = $scheme . '://' . $host . $path;
  if (isset($url_parts['query']) && strlen($url_parts['query']) > 0) {
    // The query string is appended as is: running urlencode() over the whole
    // string would also encode the "=" and "&" separators and request a
    // different resource.
    $full_url .= '?' . $url_parts['query'];
  }

  if (!function_exists('stream_get_meta_data')) {
    // Needed below to read the response headers. Falls back to the literal
    // value in case the constant is not defined.
    return defined('_LINK_CHECKER_NO_MATA_DATA') ? constant('_LINK_CHECKER_NO_MATA_DATA') : 253;
  }

  // A failed fopen() can leave an error message queued in the session even
  // though the call is @-suppressed. Remember what was queued before, so only
  // messages caused by this call are discarded on failure.
  $had_errors = isset($_SESSION['messages']['error']);
  $previous_errors = $had_errors ? $_SESSION['messages']['error'] : NULL;

  $fp = @fopen($full_url, 'r');
  if (!$fp) {
    if ($had_errors) {
      $_SESSION['messages']['error'] = $previous_errors;
    }
    else {
      unset($_SESSION['messages']['error']);
    }
    return _LINK_CHECKER_FILE_OPEN_FAILURE;
  }

  $meta_data = @stream_get_meta_data($fp);
  fclose($fp);

  $status = 0; // default when no status line is found
  if (isset($meta_data['wrapper_data']) && is_array($meta_data['wrapper_data'])) {
    foreach ($meta_data['wrapper_data'] as $header) {
      // Look for the first server header starting with "http", e.g.
      // "HTTP/1.1 200 OK"; its second word is the status code.
      if (is_string($header) && strtolower(substr($header, 0, 4)) == 'http') {
        $pieces = explode(' ', $header, 3);
        if (isset($pieces[1])) {
          $status = (int) $pieces[1];
        }
        break;
      }
    }
  }

  return $status;
}