Rinza_IE
[ class tree: Rinza_IE ] [ index: Rinza_IE ] [ all elements ]

Source for file Spider.php

Documentation is available at Spider.php

  1. <?php
  2.  
  3. /**
  4.  * Rinza/IE/Spider.php
  5.  * 
  6.  * <p>指定URLを起点として最初に得られる有意コンテンツを取得する手段を提供する<br/>
  7.  * クラスを定義しています。</p>
  8.  * @author Y. Kurei
  9.  * @copyright Copyright (c) 2006-2007, Nihon Unisys, Ltd.
  10.  * @version 1.1.4
  11.  * @package Rinza_IE
  12.  * @see Rinza_IE_Util
  13.  * @see Rinza_IE_Scraper
  14.  * @filesource
  15.  * @license http://www.tyzoh.jp/rinza/licenses/LICENSE-1.0.txt Rinza Public License
  16.  * @example examples/sample.php
  17.  */
  18.  
  19.  
  20. require_once 'HTTP/Request.php';
  21. require_once 'Rinza/IE/Util.php'
  22. require_once 'Rinza/IE/Scraper.php';
  23.  
  24.  
  25. /**
  26.  * Rinza_IE_Spider Class
  27.  * 
  28.  * <p>指定URLを起点として最初に得られる有意コンテンツを取得する手段を提供します。<br/>
  29.  * 指定URLに対して
  30.  * <ul>
  31.  *   <li>Webサーバにおいて,別のURLにリダイレクトするようにしている</li>
  32.  *   <li>対応するWebページにおいて,ユーザエージェントの受付言語に応じてURLを切り替えるようにしている</li>
  33.  *   <li>対応するWebページにおいて,自動的に別のWebページに切り替えるようにしている</li>
  34.  *   <li>対応するWebページにおいて,フレームで構成するようにしている</li>
  35.  * </ul>
  36.  * のような場合であっても,それを意識させることなく,意味のある内容を含むWebコンテンツを取得することができます。</p>
  37.  * <p>使用例:
  38.  * <code>
  39.  * $s = & new Rinza_IE_Spider($url);
  40.  * print_r($s->getSignificantContent());
  41.  * </code></p>
  42.  * @author Y. Kurei
  43.  * @package Rinza_IE
  44.  */
  45. class Rinza_IE_Spider extends HTTP_Request {
  46.     
  47.     /**
  48.      * オブジェクト生成時に指定されたコンテンツ取得先URL
  49.      * @access  private
  50.      * @var  string 
  51.      */
  52.     var $_origin_url NULL;
  53.     /**
  54.      * オブジェクト生成後に最も新しく取得したコンテンツのURL
  55.      * @access  private
  56.      * @var  string 
  57.      */
  58.     var $_target_url NULL;
  59.     /**
  60.      * オブジェクト生成後に最も新しくコンテンツを取得した際のHTTPレスポンスヘッダ
  61.      * @access  private
  62.      * @var  array 
  63.      */
  64.     var $_response_header array();
  65.     /**
  66.      * オブジェクト生成後に最も新しくコンテンツを取得した際のクッキー
  67.      * @access  private
  68.      * @var  array 
  69.      */
  70.     var $_cokkies array();
  71.     /**
  72.      * オブジェクト生成時に指定された取得希望コンテンツの言語
  73.      * @access  private
  74.      * @var  string 
  75.      */
  76.     var $_accept_lang NULL;
  77.     /**
  78.      * オブジェクト生成時に指定されたユーザエージェント
  79.      * @access  private
  80.      * @var  string 
  81.      */
  82.     var $_user_agent NULL;
  83.     /**
  84.      * オブジェクト生成時に指定されたキャッシュ指示子の追加可否
  85.      * @access  private
  86.      * @var  boolean 
  87.      * 
  88.      */
  89.     var $_cache_ctrl TRUE;
  90.     /**
  91.      * オブジェクト生成後に有意コンテンツを取得するために得られたコンテンツの履歴
  92.      * @access  private
  93.      * @var  array 
  94.      */
  95.     var $_history array();
  96.     /**
  97.      * 取得コンテンツ履歴配列(_history)のインデックス
  98.      * @access  private
  99.      * @var  integer 
  100.      */
  101.     var $_h_idx = -1;
  102.     /**
  103.      * 取得済み有意コンテンツ格納配列
  104.      * @access  private
  105.      * @var  array 
  106.      */
  107.     var $_significant array();
  108.     /**
  109.      * フレームページURLがCGIである場合の親ページURL
  110.      * @access  private
  111.      * @var  string 
  112.      */
  113.     var $_cgi_frame_url NULL;
  114.     /**
  115.      * <noscript>...<noscript>内指定リンク情報(href=)で指定されたURL(重複アクセス防止のための確認用)
  116.      * @access  private
  117.      * @var array 
  118.      */
  119.     var $_noscript_href array();
  120.     /**
  121.      * <noscript>...<noscript>内指定リンク情報(href=)で指定されたURL格納配列のインデックス
  122.      * @access  private
  123.      * @var integer 
  124.      */
  125.      var $_nh_idx = -1;
  126.     /**
  127.      * <noscript>...<noscript>内指定リンク情報(href=)で指定されたコンテンツ処理実行可否フラグ
  128.      * @access  private
  129.      * @var boolean 
  130.      */
  131.     var $_nh_flg FALSE;
  132.     /** <body onLoad="function" >内指定関数をジャンプ先URL取得対象とするか否かの判定フラグ
  133.      * @access  private
  134.      * @var boolean 
  135.      */
  136.     var $_bo_flg FALSE;
  137.     
  138.     /**
  139.      * コンストラクタ
  140.      * 
  141.      * オブジェクトの設定
  142.      * @param  string  $pURL     コンテンツ取得先URL(既定値は空文字列)
  143.      * @param  array   $pArgs    HTTP_Request用パラメタ(既定値は空配列・・・HTTP_Requestの既定値に従う)
  144.      * @param  string  $pAlang   取得希望コンテンツの言語(既定値は'ja')
  145.      * @param  stirng  $pUagent  コンテンツリクエスト時のユーザエージェント(既定値はNULL・・・HTTP_Requestの既定値に従う)
  146.      * @param  boolean  $pIgnoreNoScript  <noscript>...</noscript>内リンク情報を無視する(TRUE)か否(FALSE)か(既定値:FALSE)
  147.      * @param  boolean  $pCacheCtrl  キャッシュ指示子("Pragma: no-cache"と"Cache-Control: no-cache")を追加する(TRUE)か否(FALSE)か(既定値:TRUE)
  148.      * @access  public
  149.      */
  150.     function __construct($pURL ''$pArgs array()$pAlang 'ja'$pUagent NULL$pIgnoreNoScript=FALSE$pCacheCtrl TRUE{
  151.         
  152.         $this->_origin_url $pURL;
  153.         $this->_accept_lang $pAlang;
  154.         $this->_user_agent $pUagent;
  155.         $this->_cache_ctrl $pCacheCtrl;
  156.         parent::HTTP_Request($pURL$pArgs);
  157.         $this->_nh_flg $pIgnoreNoScript;
  158.         
  159.     }
  160.     
  161.     
  162.     /**
  163.      * 現在のコンテンツ取得先URLを起点に有意なコンテンツを取得する
  164.      * 
  165.      * <ul>
  166.      *   <li>Webサーバによるリダイレクト先コンテンツ</li>
  167.      *   <li>ユーザエージェント受付言語に則したJavaScriptによる切り替え先コンテンツ<br/>
  168.      *         navigator.language.indexof(取得希望コンテンツの言語)<br/>
  169.      *         location.href</li>
  170.      *   <li>自動的に切り替えられる先のコンテンツ<br/>
  171.      *         <meta http-equiv="refresh"></li>
  172.      *   <li>フレームで構成される場合の各フレームソースのすべてのコンテンツ</li>
  173.      * </ul>
  174.      * @return  array  以下のキーをもつ有意コンテンツ情報配列(有意コンテンツを得られなかった場合は空配列を返す)
  175.      *  <ul>
  176.      *    <li>url              -  有意コンテンツ取得先URL</li>
  177.      *    <li>response_header  -  有意コンテンツ取得時のHTTPレスポンスヘッダ値の配列
  178.      *    <li>content          -  取得した有意コンテンツ文字列</li>
  179.      *  </ul>
  180.      * @access  public
  181.      */
  182.      function getSignificantContent({
  183.         
  184.         $bodyArray array();
  185.         $body_idx 0;
  186.         
  187.         // 既に有意コンテンツ取得済みの場合
  188.         if (count($this->_significant0{
  189.             return $this->_significant;
  190.         }
  191.         
  192.         // 有意コンテンツ未取得の場合
  193.         $body FALSE;
  194.         $this->_allowRedirects TRUE// リダイレクトを受け付ける
  195.         $this->addHeader('Accept-Language'$this->_accept_lang)// 受け入れ言語を指定する
  196.         if (!is_null($this->_user_agent)) // 指定ユーザエージェントを設定
  197.             $this->addHeader('User-Agent'$this->_user_agent);
  198.         }
  199.         if (!is_null($this->_cache_ctrl)) // キャッシュ指示子の追加
  200.             $this->addHeader('Pragma''no-cache');
  201.             $this->addHeader('Cache-Control''no-cache');
  202.         }
  203.  
  204.         if (!PEAR::isError($this->sendRequest())) // コンテンツ取得リクエスト発行
  205.             $code $this->getResponseCode();
  206.             if (399 $code && $code 600// レスポンスステータス判定
  207.             else {
  208.                 $this->_response_header $this->getResponseHeader();
  209.                 $this->_cookies $this->getResponseCookies();
  210.                 $body $this->getResponseBody();
  211.                 $this->_target_url $this->_url->url// _url : parent::_url
  212.                 $this->_response_header $this->getResponseHeader();
  213.             }
  214.         }
  215.         
  216.         // 有意コンテンツ取得処理
  217.         if ($body === FALSE{
  218.         else {
  219.             // コンテンツ取得履歴の設定
  220.             $encoding 'UTF-8';
  221.             $scraper =new Rinza_IE_Scraper($body$encoding);
  222.             $body $scraper->getNormarizedString();
  223.             $this->_h_idx $this->_h_idx 1
  224.             $this->_history[$this->_h_idx]['url'$this->_target_url;
  225.             $this->_history[$this->_h_idx]['response_header'$this->_response_header;
  226.             $this->_history[$this->_h_idx]['content'$body;
  227.             // パターン別有意コンテンツ取得
  228.             $script_ctrl FALSE;
  229.             if (($lang_pos mb_strpos(strtolower($body)'navigator.language.indexof(\'' $this->_accept_lang '\')'0$encoding)) !== FALSE// Java Script(Accept-Language)
  230.                 $block_info $scraper->getPartialBlock('location.href="''"'$lang_pos);
  231.                 if (!is_null($block_info[0])) {
  232.                     $url Rinza_IE_Util::getAbsoluteURL($this->_target_url$block_info[0]);
  233.                     $this->setURL($url)// parent::setURL()
  234.                     $bodyArray $this->getSignificantContent();
  235.                     $script_ctrl TRUE;
  236.                 }
  237.             elseif (($refresh_pos mb_strpos(strtolower($body)'function refresh()'0$encoding)) !== FALSE// function refresh()
  238.                 if (($location_pos mb_strpos(strtolower($body)'location.href'$refresh_pos$encoding)) !== FALSE{
  239.                     $block_info $scraper->getPartialBlock('"''"'$location_pos);
  240.                     if (!is_null($block_info[0])) {
  241.                         $url Rinza_IE_Util::getAbsoluteURL($this->_target_url$block_info[0]);
  242.                         $this->setURL($url)// parent::setURL()
  243.                         $bodyArray $this->getSignificantContent();
  244.                         $script_ctrl TRUE;
  245.                     }
  246.                 }
  247.             elseif ((($refresh_pos mb_strpos(strtolower($body)'http-equiv="refresh"'0$encoding)) !== FALSE&&
  248.                        (($ns_epos mb_strpos(strtolower($body)'</noscript>'$refresh_pos$encoding)) === FALSE ||
  249.                         ($ns_epos !== FALSE && ($ns_spos mb_strpos(strtolower($body)'<noscript>'$refresh_pos$encoding)) !== FALSE && $ns_spos $ns_epos))) // http-equiv="Refresh"
  250.                 $block_info $scraper->getPartialBlock('url=''"'$refresh_pos);
  251.                 if (!is_null($block_info[0])) {
  252.                     $url Rinza_IE_Util::getAbsoluteURL($this->_target_urltrim($block_info[0]"'"));
  253.                     $this->setURL($url)// parent::setURL()
  254.                     $bodyArray $this->getSignificantContent();
  255.                     $script_ctrl TRUE;
  256.                 }
  257.             elseif ((($refresh_pos mb_strpos(strtolower($body)'http-equiv=\'refresh\''0$encoding)) !== FALSE&&
  258.                        (($ns_epos mb_strpos(strtolower($body)'</noscript>'$refresh_pos$encoding)) === FALSE ||
  259.                         ($ns_epos !== FALSE && ($ns_spos mb_strpos(strtolower($body)'<noscript>'$refresh_pos$encoding)) !== FALSE && $ns_spos $ns_epos))) // http-equiv='Refresh'
  260.                 $block_info $scraper->getPartialBlock('URL=''\''$refresh_pos);
  261.                 if (!is_null($block_info[0])) {
  262.                     $url Rinza_IE_Util::getAbsoluteURL($this->_target_url$block_info[0]);
  263.                     $this->setURL($url)// parent::setURL()
  264.                     $bodyArray $this->getSignificantContent();
  265.                     $script_ctrl TRUE;
  266.                 }
  267.             elseif (($refresh_pos mb_strpos(strtolower($body)'http-equiv=refresh'0$encoding)) !== FALSE &&
  268.                        (($ns_epos mb_strpos(strtolower($body)'</noscript>'$refresh_pos$encoding)) === FALSE ||
  269.                         ($ns_epos !== FALSE && ($ns_spos mb_strpos(strtolower($body)'<noscript>'$refresh_pos$encoding)) !== FALSE && $ns_spos $ns_epos))) // http-equiv=Refresh
  270.                 $block_info $scraper->getPartialBlock('URL=''>'$refresh_pos);
  271.                 if (!is_null($block_info[0])) {
  272.                     $url Rinza_IE_Util::getAbsoluteURL($this->_target_urltrim(trim($block_info[0])'",\''));
  273.                     $this->setURL($url)// parent::setURL()
  274.                     $bodyArray $this->getSignificantContent();
  275.                     $script_ctrl TRUE;
  276.                 }
  277.             elseif (!$this->_bo_flg{
  278.                 // <body onLoad="location_function">
  279.                 $body_tag $scraper->getPartialBlock('<body ''>'0);
  280.                 if (!is_null($body_tag[0])) {
  281.                     if (($onLoad_pos mb_strpos(strtolower($body_tag[0])'onload'0$encoding)) !== FALSE{
  282.                         $this->_bo_flg TRUE;
  283.                         $scraper->setString($body_tag[0]);
  284.                         $block_info $scraper->getPartialBlock('"''"'$onLoad_pos)// 関数名抽出
  285.                         if (is_null($block_info[0])) {
  286.                             $block_info $scraper->getPartialBlock('\'''\''$onLoad_pos)// 関数名抽出
  287.                         }
  288.                         if (!is_null($block_info[0])) {
  289.                             if (($setTimeout_pos mb_strpos(strtolower($body_tag[0])'settimeout('0$encoding)) !== FALSE{
  290.                                 $block_info $scraper->getPartialBlock('\'''\''$setTimeout_pos);
  291.                             }
  292.                             $scraper->setString($body);
  293.                             if (($location_func_pos mb_strpos(strtolower($body)'function ' strtolower($block_info[0])0$encoding)) !== FALSE{
  294.                                 $block_info $scraper->getPartialBlock('{''}'$location_func_pos);
  295.                                 if (!is_null($block_info[0])) {
  296.                                     if (($location_pos =mb_strpos(strtolower($block_info[0])'location.href'0$encoding)) !== FALSE{
  297.                                         $scraper->setString($block_info[0]);
  298.                                         $block_info $scraper->getPartialBlock('"''"'$location_pos);
  299.                                         if ($block_info[0]{
  300.                                             $url Rinza_IE_Util::getAbsoluteURL($this->_target_url$block_info[0]);
  301.                                             $this->setURL($url)// parent::setURL()
  302.                                             $bodyArray $this->getSignificantContent();
  303.                                             $script_ctrl TRUE;
  304.                                         }
  305.                                     }
  306.                                 }
  307.                             }
  308.                         }
  309.                     }
  310.                 }
  311.             }
  312.             
  313.             $frame_ctrl FALSE;
  314.             $true_fcontnet $scraper->getRemoveHTMLComments();
  315.             if (!$script_ctrl && ($frame_pos mb_strpos(strtolower($true_fcontnet)'<frame '0$encoding)) !== FALSE// フレーム使用ページ
  316.                 $frame_ctrl TRUE;
  317.                 $frame_body NULL;
  318.                 $copy_target_url $this->_target_url;
  319.                 $scraper->setString($true_fcontnet);
  320.                 $offset_pos 0;
  321.                 $block_info $scraper->getPartialBlock('<frame ''>'$offset_pos);
  322.                 if (!is_null($block_info[0])) {
  323.                     $frame_flg TRUE;
  324.                     $offset_pos $block_info[1];
  325.                     $frame_body array();
  326.                     $frame_idx 0;
  327.                 else {
  328.                     $frame_flg =FALSE;
  329.                 }
  330.                 while ($frame_flg{
  331.                     $scraper->setString($block_info[0]);
  332.                     $block_info $scraper->getPartialBlock('src="''"'0);
  333.                     if (!is_null($block_info[0])) {
  334.                         $url Rinza_IE_Util::getAbsoluteURL($copy_target_url$block_info[0]);
  335.                         if (preg_match("/\?/"basename($url))) // フレームURLがCGI呼び出しか否かを検査
  336.                             $this->_cgi_frame_url $copy_target_url;
  337.                         }
  338.                         $this->setURL($url)// parent::setURL()
  339.                         $this->_significant array();
  340.                         $frame_body[$frame_idx$this->getSignificantContent();
  341.                         $frame_idx++;
  342.                     }
  343.                     $scraper->setString($true_fcontnet);
  344.                     $block_info $scraper->getPartialBlock('<frame ''>'$offset_pos);
  345.                     if (!is_null($block_info[0])) {
  346.                         $offset_pos $block_info[1];
  347.                     else {
  348.                         $frame_flg =FALSE;
  349.                     }
  350.                 }
  351.                 if (($frame_cnt count($frame_body)) 0{
  352.                     for ($i 0$i $frame_cnt$i++{
  353.                         for ($j 0$j count($frame_body[$i])$j++{
  354.                             $bodyArray[$body_idx$frame_body[$i][$j];
  355.                             $body_idx++;
  356.                         }
  357.                     }
  358.                 }
  359.             }
  360.             
  361.             if (!$script_ctrl && !$frame_ctrl{
  362.                 if (is_null($this->_cgi_frame_url)) // 「フレームページではない」or 「フレームURLがCGI呼び出し以外」の場合
  363.                     $bodyArray[$body_idxarray(
  364.                                                 'url'=>$this->_target_url,
  365.                                                 'response_header'=>$this->_response_header,
  366.                                                 'content'=>$body
  367.                                             );
  368.                     $body_idx++;
  369.                 else // フレームURLがCGI呼び出しの場合
  370.                     $bodyArray[$body_idxarray(
  371.                                                 'url'=>$this->_cgi_frame_url,
  372.                                                 'response_header'=>$this->_response_header,
  373.                                                 'content'=>$body
  374.                                             );
  375.                     $body_idx++;
  376.                     $this->_cgi_frame_url NULL;
  377.                 }
  378.                 
  379.             }
  380.             
  381.             // <noscript><meta http-eqiuv="refresh" content="n;URL=xxxxx"></noscript> あるいは
  382.             // <noscript><xxx href="xxxxxx"></noscript>使用 コンテンツ処理(リンク先URLのコンテンツ追加取得)
  383.             if (!$this->_nh_flg{
  384.                 $body_idx count($bodyArray);
  385.                 for ($i 0$i $body_idx$i++{
  386.                     $nsurl $bodyArray[$i]['url'];
  387.                     $this->_nh_idx++;
  388.                     $this->_noscript_href[$this->_nh_idx$nsurl;
  389.                     $nsresponse_header $bodyArray[$i]['response_header'];
  390.                     $nscontent $bodyArray[$i]['content'];
  391.                     if (($ns_pos mb_strpos(strtolower($nscontent)'<noscript>'0$encoding)) !== FALSE{
  392.                         $ns_body NULL;
  393.                         $scraper->setString($nscontent);
  394.                         $offset_pos 0;
  395.                         $block_info $scraper->getPartialBlock('<noscript>''</noscript>'$offset_pos);
  396.                         if (!is_null($block_info[0])) {
  397.                             $ns_flg TRUE;
  398.                             $offset_pos $block_info[1];
  399.                             $ns_body array();
  400.                             $ns_idx 0;
  401.                         else {
  402.                             $ns_flg FALSE;
  403.                         }
  404.                         while ($ns_flg{
  405.                             $scraper->setString($block_info[0]);
  406.                             $block_info $scraper->getPartialBlock('http-equiv=''>'0);
  407.                             if (!is_null($block_info[0])) {
  408.                                 $block_info trim($scraper->getPartialBlock('URL="'NULL0),"',\"");
  409.                                 if (!is_null($block_info[0])) {
  410.                                     $url Rinza_IE_Util::getAbsoluteURL($nsurl$block_info[0]);
  411.                                     if ($this->_nh_idx 0{
  412.                                         $this->_nh_flg TRUE;
  413.                                         $this->_nh_idx++;
  414.                                         $this->_noscript_href[$this->_nh_idx$url;
  415.                                         $this->setURL($url)// parent::setURL()
  416.                                         $this->_significant array();
  417.                                         $tmp_body $this->getSignificantContent();
  418.                                         if (!empty($tmp_body)) {
  419.                                             $ns_body[$ns_idx$tmp_body;
  420.                                             $ns_idx++;
  421.                                         }
  422.                                     else {
  423.                                         if (!array_search($url$this->_noscript_href)) {
  424.                                             $this->_nh_flg TRUE;
  425.                                             $this->_nh_idx++;
  426.                                             $this->_noscript_href[$this->_nh_idx$url;
  427.                                             $this->setURL($url)// parent::setURL()
  428.                                             $this->_significant array();
  429.                                             $tmp_body $this->getSignificantContent();
  430.                                             if (!empty($tmp_body)) {
  431.                                                 $ns_body[$ns_idx$tmp_body;
  432.                                                 $ns_idx++;
  433.                                             }
  434.                                         }
  435.                                     }
  436.                                 }
  437.                             else {
  438.                                 $block_info $scraper->getPartialBlock('href="''"'0);
  439.                                 if (!is_null($block_info[0])) {
  440.                                     if (preg_match("/^#/"$block_info[0])) {
  441.                                     else {
  442.                                         $url Rinza_IE_Util::getAbsoluteURL($nsurl$block_info[0]);
  443.                                         if (strcmp($nsurl$url!= 0{
  444.                                             if ($this->_nh_idx 0{
  445.                                                 $this->_nh_flg TRUE;
  446.                                                 $this->_nh_idx++;
  447.                                                 $this->_noscript_href[$this->_nh_idx$url;
  448.                                                 $this->setURL($url)// parent::setURL()
  449.                                                 $this->_significant array();
  450.                                                 $tmp_body $this->getSignificantContent();
  451.                                                 if (!empty($tmp_body)) {
  452.                                                     $ns_body[$ns_idx$tmp_body;
  453.                                                     $ns_idx++;
  454.                                                 }
  455.                                             else {
  456.                                                 if (!array_search($url$this->_noscript_href)) {
  457.                                                     $this->_nh_flg TRUE;
  458.                                                     $this->_nh_idx++;
  459.                                                     $this->_noscript_href[$this->_nh_idx$url;
  460.                                                     $this->setURL($url)// parent::setURL()
  461.                                                     $this->_significant array();
  462.                                                     $tmp_body $this->getSignificantContent();
  463.                                                     if (!empty($tmp_body)) {
  464.                                                         $ns_body[$ns_idx$tmp_body;
  465.                                                         $ns_idx++;
  466.                                                     }
  467.                                                 }
  468.                                             }
  469.                                         }
  470.                                     }
  471.                                 }
  472.                             }
  473.                             $scraper->setString($nscontent);
  474.                             $block_info $scraper->getPartialBlock('<noscript>''</noscript>'$offset_pos);
  475.                             if (!is_null($block_info[0])) {
  476.                                 $offset_pos $block_info[1];
  477.                             else {
  478.                                 $ns_flg =FALSE;
  479.                             }
  480.                         }
  481.                         if (($ns_cnt count($ns_body)) 0{
  482.                             for ($j 0$j $ns_cnt$j++{
  483.                                 for ($k 0$k count($ns_body[$j])$k++{
  484.                                     $bodyArray[$body_idx$ns_body[$j][$k];
  485.                                     $body_idx++;
  486.                                 }
  487.                             }
  488.                         }
  489.                     }
  490.                 }
  491.             }
  492.  
  493.             // <iframe>使用コンテンツ処理(iframリンク先URLのコンテンツ追加取得)
  494.             $body_idx count($bodyArray);
  495.             for ($i 0$i $body_idx$i++{
  496.                 $iurl $bodyArray[$i]['url'];
  497.                 $iresponse_header $bodyArray[$i]['response_header'];
  498.                 $icontent $bodyArray[$i]['content'];
  499.                 $scraper->setString($icontent);
  500.                 $true_icontent $scraper->getRemoveHTMLComments();
  501.                 if (($iframe_pos mb_strpos(strtolower($true_icontent)'<iframe '0$encoding)) !== FALSE{
  502.                     $iframe_body NULL;
  503.                     $scraper->setString($true_icontent);
  504.                     $offset_pos 0;
  505.                     $block_info $scraper->getPartialBlock('<iframe ''>'$offset_pos);
  506.                     if (!is_null($block_info[0])) {
  507.                         $iframe_flg TRUE;
  508.                         $offset_pos $block_info[1];
  509.                         $iframe_body array();
  510.                         $iframe_idx 0;
  511.                     else {
  512.                         $iframe_flg FALSE;
  513.                     }
  514.                     while ($iframe_flg{
  515.                         $scraper->setString($block_info[0]);
  516.                         $block_info $scraper->getPartialBlock('src="''"'0);
  517.                         if (!is_null($block_info[0])) {
  518.                             $url Rinza_IE_Util::getAbsoluteURL($iurl$block_info[0]);
  519.                             $this->setURL($url)// parent::setURL()
  520.                             $this->_significant array();
  521.                             $tmp_body $this->getSignificantContent();
  522.                             if (!empty($tmp_body)) {
  523.                                 $iframe_body[$iframe_idx$tmp_body;
  524.                                 $iframe_idx++;
  525.                             else {
  526.                             }
  527.                         }
  528.                         $scraper->setString($true_icontent);
  529.                         $block_info $scraper->getPartialBlock('<iframe ''>'$offset_pos);
  530.                         if (!is_null($block_info[0])) {
  531.                             $offset_pos $block_info[1];
  532.                         else {
  533.                             $iframe_flg =FALSE;
  534.                         }
  535.                     }
  536.                     if (($iframe_cnt count($iframe_body)) 0{
  537.                         for ($j 0$j $iframe_cnt$j++{
  538.                             for ($k 0$k count($iframe_body[$j])$k++{
  539.                                 $bodyArray[$body_idx$iframe_body[$j][$k];
  540.                                 $body_idx++;
  541.                             }
  542.                         }
  543.                     }
  544.                 }
  545.             }
  546.         }
  547.         unset($scraper);
  548.  
  549.         return $this->_significant $bodyArray;
  550.         
  551.     }
  552.     
  553.     
  554.     /**
  555.      * 最も新しく取得したコンテンツのURLを得る
  556.      * 
  557.      * @return  string  最も新しく取得したコンテンツのURL
  558.      * @access  public
  559.      */
  560.     function getTargetURL({
  561.         
  562.         return $this->_target_url;
  563.         
  564.     }
  565.     
  566.     
  567.     /**
  568.      * 取得コンテンツ履歴を得る
  569.      * 
  570.      * @return  array  以下のキーをもつ取得コンテンツ履歴配列
  571.      *  <ul>
  572.      *    <li>url              -  コンテンツ取得先URL</li>
  573.      *    <li>response_header  -  コンテンツ取得時のHTTPレスポンスヘッダ値の配列</li>
  574.      *    <li>content          -  取得コンテンツ文字列</li>
  575.      *  </ul>
  576.      * @access  public
  577.      */
  578.     public function getContentHistories({
  579.         
  580.         return $this->_history;
  581.         
  582.     }
  583.     
  584.     
  585. }
  586.  
  587.  
  588. /** 
  589.  * This Program is distributed under version 1.0 of the Rinza Public
  590.  * License Agreement, that is bundled with this package in the file
  591.  * LICENSE, and is available through the website at the following URL:
  592.  * http://www.tyzoh.jp/rinza/licenses/LICENSE-1.0.txt.
  593.  * 
  594.  * This is the Original Program.
  595.  * The Initial Developer of the Original Program is Nihon Unisys, Ltd.
  596.  * The Original Program is copyrighted (C) 2006-2007 by Nihon Unisys, Ltd. with
  597.  * all rights reserved.
  598.  * There is NO WARRANTY OF ANY KIND by the Initial Developer of the
  599.  * Original Program.
  600.  */
  601.  
  602. ?>

Documentation generated on Fri, 09 Nov 2007 14:01:44 +0900 by phpDocumentor 1.4.0