| 1 |
<?php |
<?php |
| 2 |
// $Id: legis_bills.inc,v 1.6 2008/01/17 20:38:24 drumm Exp $ |
// $Id: legis_bills.inc,v 1.7 2008/10/30 03:21:49 drumm Exp $ |
| 3 |
|
|
| 4 |
function legislature_ia_bills() { |
function legislature_ia_bills() { |
| 5 |
set_time_limit(0); |
set_time_limit(0); |
| 157 |
// Start parsing <tr> tags and creating bill history array where an element = Date, Event description |
// Start parsing <tr> tags and creating bill history array where an element = Date, Event description |
| 158 |
$data['actions'] = array(); |
$data['actions'] = array(); |
| 159 |
foreach ($xml->body->table[1]->tr as $row) { |
foreach ($xml->body->table[1]->tr as $row) { |
| 160 |
$action = array ( |
$action = array( |
| 161 |
'date' => date_parse(preg_replace('/\s+/', ' ', $row->td[0]->font)), |
'date' => date_parse(preg_replace('/\s+/', ' ', $row->td[0]->font)), |
| 162 |
'description' => trim(preg_replace('/<[^>]*>([^<.,]*).*/', '\1', preg_replace('/\s+/', ' ', $row->td[1]->font->asXML()))), |
'description' => trim(preg_replace('/<[^>]*>([^<.,]*).*/', '\1', preg_replace('/\s+/', ' ', $row->td[1]->font->asXML()))), |
| 163 |
'motion' => preg_replace('/\s+/', ' ', strip_tags($row->td[1]->font->asXML())), |
'motion' => preg_replace('/\s+/', ' ', strip_tags($row->td[1]->font->asXML())), |
| 184 |
$action['outcome'] = $passed[$matches[1]]; |
$action['outcome'] = $passed[$matches[1]]; |
| 185 |
$action['ayes'] = $matches[3]; |
$action['ayes'] = $matches[3]; |
| 186 |
$action['noes'] = $matches[4]; |
$action['noes'] = $matches[4]; |
| 187 |
|
|
| 188 |
|
$journal = drupal_http_request(preg_replace('/#.*/', '', $row->td[1]->font->a['href'])); |
| 189 |
|
$result_string = legislature_pdf2string($journal->data); |
| 190 |
|
|
| 191 |
|
$votes = array( |
| 192 |
|
'Yeas' => NULL, |
| 193 |
|
'Nays' => NULL, |
| 194 |
|
'Present' => NULL, |
| 195 |
|
'Absent' => NULL, |
| 196 |
|
); |
| 197 |
|
|
| 198 |
|
$measure = str_replace(array('HF', 'SF'), array('H.F.', 'S.F.'), $bill['measure']); |
| 199 |
|
|
| 200 |
|
// Extract text lines after the S.F bill number to ensure we get |
| 201 |
|
// the lines that correspond to the correct bill number |
| 202 |
|
$entries = array(); |
| 203 |
|
foreach (explode('On the question', $result_string) as $entry) { |
| 204 |
|
if (preg_match("/\(". $measure ."\)/", $entry) == 1) { |
| 205 |
|
array_push($entries, $entry); |
| 206 |
|
} |
| 207 |
|
} |
| 208 |
|
|
| 209 |
|
// Extract text lines based on 2 conditions, we've seen "Shall the bill pass" and S.F. (bill number) |
| 210 |
|
foreach($entries as $value) { |
| 211 |
|
if (preg_match("/Shall the bill pass?/", $value) == 1 && preg_match('/'. $measure .'/', $value) == 1) { |
| 212 |
|
$split_result_string_on_the_bill = explode("The bill", $value); |
| 213 |
|
} |
| 214 |
|
} |
| 215 |
|
$unformatted_text = $split_result_string_on_the_bill[0]; |
| 216 |
|
$unformatted_text = preg_replace("/\d+ JOURNAL OF THE SENATE .* Day /", "", $unformatted_text); |
| 217 |
|
|
| 218 |
|
// This section attempts to format the vote text for easy parsing |
| 219 |
|
// Use # as a way to easily split the string |
| 220 |
|
$unformatted_text = substr($unformatted_text, strpos($unformatted_text, ':') + 1); |
| 221 |
|
$unformatted_text = str_replace(', none', ', 0: none', $unformatted_text); |
| 222 |
|
$unformatted_text = str_replace('.', '', $unformatted_text); |
| 223 |
|
$unformatted_text = str_replace('Yeas', '#Yeas', $unformatted_text); |
| 224 |
|
$unformatted_text = str_replace('Nays', '#Nays', $unformatted_text); |
| 225 |
|
$unformatted_text = str_replace('Present', '#Present', $unformatted_text); |
| 226 |
|
$unformatted_text = str_replace('Absent', '#Absent', $unformatted_text); |
| 227 |
|
|
| 228 |
|
foreach(explode('#', $unformatted_text) as $position) { |
| 229 |
|
if (!empty($position)) { |
| 230 |
|
$pos_array = explode(':', $position); |
| 231 |
|
$pos_vote_count = explode(',', $pos_array[0]); |
| 232 |
|
$pos_names = explode(' ', $pos_array[1]); |
| 233 |
|
$pos_count = trim($pos_vote_count[1]); |
| 234 |
|
if ($pos_count != '0') { |
| 235 |
|
$cleaned_names = array(); |
| 236 |
|
for ($i=0; $i < count($pos_names); $i += 1) { |
| 237 |
|
if (preg_match("/\d/",$pos_names[$i])==1){ |
| 238 |
|
$i += 6; |
| 239 |
|
} |
| 240 |
|
elseif (!empty($pos_names[$i])) { |
| 241 |
|
array_push($cleaned_names, $pos_names[$i]); |
| 242 |
|
} |
| 243 |
|
} |
| 244 |
|
$votes[$pos_vote_count[0]] = $cleaned_names; |
| 245 |
|
} |
| 246 |
|
} |
| 247 |
|
} |
| 248 |
|
dpr($votes); |
| 249 |
|
|
| 250 |
|
$data['actions'][] = $action; |
| 251 |
} |
} |
| 252 |
|
} |
| 253 |
|
} |
| 254 |
|
|
| 255 |
// todo figure out bill status |
/**
 * Convert pdf to plain text.
 *
 * Walks the raw PDF bytes looking for "stream" ... "endstream" sections,
 * inflates each section with gzuncompress() and appends whatever text
 * legislature_pdf_extract_text() finds in it. Sections that do not inflate
 * (images, other filters) contribute nothing.
 *
 * @param $content
 *   PDF document as a string of raw bytes. Anything that is not a non-empty
 *   string (for example the result of a failed download) yields ''.
 * @return
 *   Plain text of the pdf with every run of whitespace collapsed to a single
 *   space.
 */
function legislature_pdf2string($content) {
  // Nothing to scan; also keeps the offset arithmetic below inside the string.
  if (!is_string($content) || $content === '') {
    return '';
  }

  $searchstart = 'stream';
  $searchend = 'endstream';
  $pdfText = '';
  $startpos = 0;

  while (($pos = strpos($content, $searchstart, $startpos)) !== FALSE) {
    // The stream data begins after the keyword and its end-of-line marker,
    // which may be CRLF or a single LF (a lone CR is tolerated as well).
    $data_start = $pos + strlen($searchstart);
    $eol = substr($content, $data_start, 2);
    if ($eol === "\r\n") {
      $data_start += 2;
    }
    elseif ($eol !== FALSE && $eol !== '' && ($eol[0] === "\n" || $eol[0] === "\r")) {
      $data_start += 1;
    }

    $pos2 = strpos($content, $searchend, $data_start);
    if ($pos2 === FALSE) {
      // Unterminated stream: there is nothing more that can be extracted.
      break;
    }

    // Drop the end-of-line marker that normally precedes "endstream".
    $data_end = $pos2;
    if ($data_end - $data_start >= 2 && substr($content, $data_end - 2, 2) === "\r\n") {
      $data_end -= 2;
    }
    elseif ($data_end > $data_start && ($content[$data_end - 1] === "\n" || $content[$data_end - 1] === "\r")) {
      $data_end -= 1;
    }

    // Streams that are not zlib-compressed are expected here, so the warning
    // from gzuncompress() is suppressed on purpose and FALSE is passed on.
    $data = @gzuncompress((string) substr($content, $data_start, $data_end - $data_start));
    if ($data === FALSE && $data_end < $pos2) {
      // The trimmed bytes may have belonged to the compressed data itself
      // (writer omitted the end-of-line marker), so retry with them included.
      $data = @gzuncompress((string) substr($content, $data_start, $pos2 - $data_start));
    }

    $pdfText .= legislature_pdf_extract_text($data);

    // Resume after this "endstream" so its own "stream" suffix is not
    // mistaken for the start of another section.
    $startpos = $pos2 + strlen($searchend);
  }

  return preg_replace('/(\s)+/', ' ', $pdfText);
}
| 299 |
// Each the link goes to a senate journal in pdf form. The links contain |
|
| 300 |
// spaces which are escaped for parsing |
/**
 * Used for pdf conversion.
 *
 * Pulls the string operands of text-showing operators out of an uncompressed
 * PDF content stream. Only operands that directly follow a Tw, Td, Tc, Tm or
 * T* operator are picked up, in either the "(text) Tj" or the
 * "[(te) -20 (xt)] TJ" form; numbers inside the array form are discarded.
 *
 * @param $psData
 *   Uncompressed data. Anything that is not a string (for example FALSE from
 *   a failed gzuncompress()) yields ''.
 * @return
 *   Plain text of pdf.
 */
function legislature_pdf_extract_text($psData){
  if (!is_string($psData)) {
    return '';
  }

  $text = '';

  // Hide escaped characters that could be mistaken for the end of a text
  // field. strtr() works left to right in a single pass, so an escaped
  // backslash is consumed as a pair and its second half can never be misread
  // as the start of "\)" or "\]".
  $psData = strtr($psData, array(
    '\\\\' => '##BACKSLASH##',
    '\)' => '##ENDBRACKET##',
    '\]' => '##ENDSBRACKET##',
  ));

  preg_match_all('/(T[wdcm*])[\s]*(\[([^\]]*)\]|\(([^\)]*)\))[\s]*Tj/si', $psData, $matches);
  for ($i = 0; $i < count($matches[0]); $i += 1) {
    if ($matches[3][$i] !== '') {
      // Array form: run another match over the contents and join the string
      // pieces, dropping the numbers between them.
      preg_match_all('/\(([^)]*)\)/si', $matches[3][$i], $subMatches);
      foreach ($subMatches[1] as $subMatch) {
        $text .= $subMatch;
      }
    }
    elseif ($matches[4][$i] !== '') {
      // Single string form; text that follows a Tc operator gets a leading
      // space.
      $text .= ($matches[1][$i] == 'Tc' ? ' ' : '') . $matches[4][$i];
    }
  }

  // Translate special characters and put back brackets and backslashes.
  $trans = array(
    '...' => '…',
    '\205' => '…',
    '\221' => chr(145),
    '\222' => chr(146),
    '\223' => chr(147),
    '\224' => chr(148),
    '\226' => '-',
    '\267' => '•',
    '\(' => '(',
    '\[' => '[',
    '##ENDBRACKET##' => ')',
    '##ENDSBRACKET##' => ']',
    '##BACKSLASH##' => '\\',
    chr(133) => '-',
    chr(141) => chr(147),
    chr(142) => chr(148),
    chr(143) => chr(145),
    chr(144) => chr(146),
  );
  return strtr($text, $trans);
}