/[drupal]/contributions/modules/legislature/imports/ia/legis_bills.inc
ViewVC logotype

Diff of /contributions/modules/legislature/imports/ia/legis_bills.inc

Parent Directory | Revision Log | View Revision Graph | View Patch

revision 1.7, Thu Oct 30 03:21:49 2008 UTC revision 1.8, Thu Jan 1 00:56:19 2009 UTC
# Line 1  Line 1 
1  <?php  <?php
2  // $Id: legis_bills.inc,v 1.6 2008/01/17 20:38:24 drumm Exp $  // $Id: legis_bills.inc,v 1.7 2008/10/30 03:21:49 drumm Exp $
3    
4  function legislature_ia_bills() {  function legislature_ia_bills() {
5    set_time_limit(0);    set_time_limit(0);
# Line 157  function legislature_ia_bill_history($ht Line 157  function legislature_ia_bill_history($ht
157    // Start parsing <tr> tags and creating bill history array where an element = Date, Event description    // Start parsing <tr> tags and creating bill history array where an element = Date, Event description
158    $data['actions'] = array();    $data['actions'] = array();
159    foreach ($xml->body->table[1]->tr as $row) {    foreach ($xml->body->table[1]->tr as $row) {
160      $action = array (      $action = array(
161        'date' => date_parse(preg_replace('/\s+/', ' ', $row->td[0]->font)),        'date' => date_parse(preg_replace('/\s+/', ' ', $row->td[0]->font)),
162        'description' => trim(preg_replace('/<[^>]*>([^<.,]*).*/', '\1', preg_replace('/\s+/', ' ', $row->td[1]->font->asXML()))),        'description' => trim(preg_replace('/<[^>]*>([^<.,]*).*/', '\1', preg_replace('/\s+/', ' ', $row->td[1]->font->asXML()))),
163        'motion' => preg_replace('/\s+/', ' ', strip_tags($row->td[1]->font->asXML())),        'motion' => preg_replace('/\s+/', ' ', strip_tags($row->td[1]->font->asXML())),
# Line 184  function legislature_ia_bill_history($ht Line 184  function legislature_ia_bill_history($ht
184        $action['outcome'] = $passed[$matches[1]];        $action['outcome'] = $passed[$matches[1]];
185        $action['ayes'] = $matches[3];        $action['ayes'] = $matches[3];
186        $action['noes'] = $matches[4];        $action['noes'] = $matches[4];
187    
188          $journal = drupal_http_request(preg_replace('/#.*/', '', $row->td[1]->font->a['href']));
189          $result_string = legislature_pdf2string($journal->data);
190    
191          $votes = array(
192            'Yeas'    => NULL,
193            'Nays'    => NULL,
194            'Present' => NULL,
195            'Absent'  => NULL,
196          );
197    
198          $measure = str_replace(array('HF', 'SF'), array('H.F.', 'S.F.'), $bill['measure']);
199    
200          // Extract text lines after the S.F bill number to ensure we get
201          // the lines that correspond to the correct bill number
202          $entries = array();
203          foreach (explode('On the question', $result_string) as $entry) {
204            if (preg_match("/\(". $measure ."\)/", $entry) == 1) {
205              array_push($entries, $entry);
206            }
207          }
208    
209          // Extract text lines based on 2 conditions, we've seen "Shall the bill pass" and S.F. (bill number)
210          foreach($entries as $value) {
211            if (preg_match("/Shall the bill pass?/", $value) == 1 && preg_match('/'. $measure .'/', $value) == 1) {
212              $split_result_string_on_the_bill = explode("The bill", $value);
213            }
214          }
215          $unformatted_text = $split_result_string_on_the_bill[0];
216          $unformatted_text = preg_replace("/\d+ JOURNAL OF THE SENATE .* Day /", "", $unformatted_text);
217    
218          // This section attempts to format the vote text for easy parsing
219          // Use # as a way to easily split the string
220          $unformatted_text = substr($unformatted_text, strpos($unformatted_text, ':') + 1);
221          $unformatted_text = str_replace(', none', ', 0: none', $unformatted_text);
222          $unformatted_text = str_replace('.', '', $unformatted_text);
223          $unformatted_text = str_replace('Yeas', '#Yeas', $unformatted_text);
224          $unformatted_text = str_replace('Nays', '#Nays', $unformatted_text);
225          $unformatted_text = str_replace('Present', '#Present', $unformatted_text);
226          $unformatted_text = str_replace('Absent', '#Absent', $unformatted_text);
227    
228          foreach(explode('#', $unformatted_text) as $position) {
229            if (!empty($position)) {
230              $pos_array = explode(':', $position);
231              $pos_vote_count = explode(',', $pos_array[0]);
232              $pos_names = explode(' ', $pos_array[1]);
233              $pos_count = trim($pos_vote_count[1]);
234              if ($pos_count != '0') {
235                $cleaned_names = array();
236                for ($i=0; $i < count($pos_names); $i += 1) {
237                  if (preg_match("/\d/",$pos_names[$i])==1){
238                    $i += 6;
239                  }
240                  elseif (!empty($pos_names[$i])) {
241                    array_push($cleaned_names, $pos_names[$i]);
242                  }
243                }
244                $votes[$pos_vote_count[0]] = $cleaned_names;
245              }
246            }
247          }
248          dpr($votes);
249    
250          $data['actions'][] = $action;
251      }      }
252      }
253    }
254    
/**
 * Convert pdf to plain text.
 *
 * Walks the raw PDF looking for "stream" ... "endstream" sections, inflates
 * each section with gzuncompress() and appends whatever text
 * legislature_pdf_extract_text() finds in the inflated data. Sections that
 * cannot be inflated are skipped.
 *
 * @param $content
 *   PDF document, as a raw byte string.
 * @return
 *   Plain text of pdf with every run of whitespace collapsed to one space.
 *   An empty string when $content is empty or not a string.
 */
function legislature_pdf2string($content) {
  // A failed download hands us NULL/FALSE; there is nothing to scan.
  if (!is_string($content) || $content === '') {
    return '';
  }

  $searchstart = 'stream';
  $searchend = 'endstream';
  $pdfText = '';
  $startpos = 0;

  while (($pos = strpos($content, $searchstart, $startpos)) !== FALSE) {
    // The payload starts after the keyword and its end-of-line marker.
    // Compare against real "\r\n" / "\n" strings: comparing a character with
    // the integers 0x0d / 0x0a never matches in PHP.
    $begin = $pos + strlen($searchstart);
    if (substr($content, $begin, 2) === "\r\n") {
      $begin += 2;
    }
    elseif (substr($content, $begin, 1) === "\n") {
      $begin += 1;
    }

    // Look for the terminator only after this section's own keyword.
    $pos2 = strpos($content, $searchend, $begin);
    if ($pos2 === FALSE) {
      break;
    }

    // The end-of-line marker in front of "endstream" is deliberately left in
    // place so that a payload whose last byte happens to be 0x0a or 0x0d is
    // never truncated.
    // NOTE(review): this relies on gzuncompress() ignoring bytes after the
    // end of the zlib stream, as the previous implementation also did --
    // confirm.
    $textsection = substr($content, $begin, $pos2 - $begin);

    // Best effort: sections that are not zlib data (images, uncompressed
    // streams, ...) make gzuncompress() warn and return FALSE; skip those.
    $data = @gzuncompress($textsection);
    if ($data !== FALSE) {
      $pdfText .= legislature_pdf_extract_text($data);
    }

    // Resume after the terminator so the "stream" inside "endstream" is not
    // mistaken for the start of another section.
    $startpos = $pos2 + strlen($searchend);
  }

  return preg_replace('/(\s)+/', ' ', $pdfText);
}
299    // Each the link goes to a senate journal in pdf form. The links contain  
/**
 * Used for pdf conversion.
 *
 * Pulls the string operands of text-showing operators out of an inflated PDF
 * content stream: either a single "(...)" literal or the "(...)" pieces of a
 * "[...]" array, each preceded by a Tw/Td/Tc/Tm/T* operator and followed by
 * Tj (matched case-insensitively, so TJ as well).
 *
 * @param $psData
 *   Uncompressed data.
 * @return
 *   Plain text of pdf. An empty string when $psData is not a string.
 */
function legislature_pdf_extract_text($psData){
  if (!is_string($psData)) {
    return '';
  }

  $text = '';

  // Hide escaped characters that could be mistaken for the end of a text
  // field. The escaped backslash has to be hidden first: otherwise the "\)"
  // in a string ending with "\\" (a literal backslash followed by the real
  // closing bracket) is taken for an escaped bracket and the field never
  // terminates. str_replace() applies the pairs in the order given.
  $psData = str_replace(
    array('\\\\', '\)', '\]'),
    array('##BACKSLASH##', '##ENDBRACKET##', '##ENDSBRACKET##'),
    $psData
  );

  // $matches[1] = operator, [3] = contents of a [...] array, [4] = contents
  // of a single (...) string. The match count is taken once up front; a
  // FALSE result (regex failure) simply skips the loop.
  $count = preg_match_all('/(T[wdcm*])[\s]*(\[([^\]]*)\]|\(([^\)]*)\))[\s]*Tj/si', $psData, $matches);
  for ($i = 0; $i < $count; $i += 1) {
    if ($matches[3][$i] !== '') {
      // Run another match over the contents and join the string pieces.
      preg_match_all('/\(([^)]*)\)/si', $matches[3][$i], $subMatches);
      $text .= implode('', $subMatches[1]);
    }
    elseif ($matches[4][$i] !== '') {
      // A string shown after a Tc operator gets a leading space.
      $text .= ($matches[1][$i] == 'Tc' ? ' ' : '') . $matches[4][$i];
    }
  }

  // Translate special characters and put back brackets and backslashes.
  // strtr() with an array always tries the longest key first and never
  // re-translates its own output.
  $trans = array(
    '...'                => '…',
    '\205'                => '…',
    '\221'                => chr(145),
    '\222'                => chr(146),
    '\223'                => chr(147),
    '\224'                => chr(148),
    '\226'                => '-',
    '\267'                => '•',
    '\('                => '(',
    '\['                => '[',
    '##ENDBRACKET##'    => ')',
    '##ENDSBRACKET##'    => ']',
    '##BACKSLASH##'        => '\\',
    chr(133)            => '-',
    chr(141)            => chr(147),
    chr(142)            => chr(148),
    chr(143)            => chr(145),
    chr(144)            => chr(146),
  );
  return strtr($text, $trans);
}

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.8

  ViewVC Help
Powered by ViewVC 1.1.2