Server : nginx/1.22.1
System : Linux iZwz9daxib3w3i063fw434Z 3.10.0-1127.19.1.el7.x86_64 #1 SMP Tue Aug 25 17:23:54 UTC 2020 x86_64
User : www ( 1000)
PHP Version : 5.6.40
Disable Function : passthru,exec,system,putenv,chroot,chgrp,chown,shell_exec,popen,proc_open,pcntl_exec,ini_alter,ini_restore,dl,openlog,syslog,readlink,symlink,popepassthru,pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,imap_open,apache_setenv
Directory :  /www/wwwroot/www.jkmold.com/phpcms/libs/classes/
Upload File :
Current Directory [ Writeable ] Root Directory [ Writeable ]


Current File : /www/wwwroot/www.jkmold.com/phpcms/libs/classes/segment.class.php
<?php

/**

 * 中文分词操作类

 * @author wangcanjia

 *

 */

class segment {

	// Ranked dictionary, filled by the constructor as
	// rank_dic[byte length of word][word] = rank; read by is_word() and get_rank().
	public $rank_dic = array();

	// Lookup set of single-character surnames: keys are the two-byte slices of
	// $cn_one_name, values are 1. Built by the constructor.
	public $one_name_dic = array();

	// Lookup set of compound surnames: keys are the space-separated entries of
	// $cn_two_name, values are 1. Built by the constructor.
	public $two_name_dic = array();

	// Not referenced by any method visible in this class.
	public $new_word = array();

	// Text currently being segmented (set by get_source() / simple_split()).
	public $source_string = '';

	// Segmented output accumulated by split_result().
	public $result_string = '';

	public $split_char = ' '; // separator inserted between output tokens

	public $SplitLen = 4; // retained word length (not referenced by any method visible in this class)

	// Regex alternation; par_other() stops merging a given name onto a surname
	// when the following word matches it.
	public $especial_char = "和|的|是";

	// Not referenced by any method visible in this class.
	public $new_word_limit = "在|的|与|或|就|你|我|他|她|有|了|是|其|能|对|地";

	// Regex alternation of unit words; par_other() glues a following word that
	// matches it onto a preceding number word.
	public $common_unit = "年|月|日|时|分|秒|点|元|百|千|万|亿|位|辆";

	// Regex alternation tested by revise_string() against each two-byte
	// character before handing it to get_alab_num().
	// NOTE(review): as written with ASCII characters this is not a valid PCRE
	// body ("|+|" has nothing to repeat, "." matches any byte), "B" is missing
	// and "s " carries a trailing space - presumably these were full-width
	// forms in the original encoding; confirm against the original file.
	public $cn_number = "0|1|2|3|4|5|6|7|8|9|+|-|%|.|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s |t|u|v|w|x|y|z|A|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z";

	// Regex alternation of Chinese numerals; par_other() treats any word that
	// matches it as a number word.
	public $cn_sg_num = "一|二|三|四|五|六|七|八|九|十|百|千|万|亿|数";

	public $max_len = 13; // dictionary words are at most 7 Chinese characters; this value is the highest index into the byte array

	public $min_len = 3;  // minimum of 2 Chinese characters; this value is the highest index into the byte array

	// Space-separated list of compound surnames; consumed and unset by the
	// constructor when the dictionary is loaded.
	public $cn_two_name = "端木 南宫 谯笪 轩辕 令狐 钟离 闾丘 长孙 鲜于 宇文 司徒 司空 上官 欧阳 公孙 西门 东门 左丘 东郭 呼延 慕容 司马 夏侯 诸葛 东方 赫连 皇甫 尉迟 申屠";

	// Concatenated single-character surnames; the constructor slices it two
	// bytes at a time and then unsets it.
	// NOTE(review): the two-byte slicing assumes a double-byte encoding of this
	// file (presumably GBK) - confirm the file encoding.
	public $cn_one_name = "赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗凤花方俞任袁柳酆鲍史唐费廉岑薛雷贺倪汤滕殷罗毕郝邬安常乐于时傅皮卡齐康伍余元卜顾孟平黄穆萧尹姚邵堪汪祁毛禹狄米贝明臧计伏成戴谈宋茅庞熊纪舒屈项祝董粱杜阮蓝闵席季麻强贾路娄危江童颜郭梅盛林刁钟徐邱骆高夏蔡田樊胡凌霍虞万支柯咎管卢莫经房裘缪干解应宗宣丁贲邓郁单杭洪包诸左石崔吉钮龚程嵇邢滑裴陆荣翁荀羊於惠甄魏加封芮羿储靳汲邴糜松井段富巫乌焦巴弓牧隗谷车侯宓蓬全郗班仰秋仲伊宫宁仇栾暴甘钭厉戎祖武符刘姜詹束龙叶幸司韶郜黎蓟薄印宿白怀蒲台从鄂索咸籍赖卓蔺屠蒙池乔阴郁胥能苍双闻莘党翟谭贡劳逄姬申扶堵冉宰郦雍郤璩桑桂濮牛寿通边扈燕冀郏浦尚农温别庄晏柴翟阎充慕连茹习宦艾鱼容向古易慎戈廖庚终暨居衡步都耿满弘匡国文寇广禄阙东殴殳沃利蔚越夔隆师巩厍聂晁勾敖融冷訾辛阚那简饶空曾沙须丰巢关蒯相查后江游竺";

  

	/**
	 * Builds the surname lookup tables and loads the ranked word dictionary.
	 *
	 * When $loaddic is false nothing is initialised at all: the surname tables
	 * and the dictionary both stay empty.
	 *
	 * The dictionary is read from PC_PATH/libs/data/dict/dict.csv, one
	 * "word rank" pair per line separated by a single space, and stored as
	 * rank_dic[byte length of word][word] = rank.
	 *
	 * @param bool $loaddic Whether to build the tables and load the dictionary.
	 */
	function __construct($loaddic=true) {
		if(!$loaddic) return;

		// Single-character surnames are stored back to back, two bytes each.
		// Stop before a dangling last byte so an odd-length list cannot read
		// past the end of the string.
		$names = $this->cn_one_name;
		$total = strlen($names);
		for($i=0; $i+1<$total; $i+=2) {
			$this->one_name_dic[$names[$i].$names[$i+1]] = 1;
		}

		foreach(explode(" ",$this->cn_two_name) as $n) {
			$this->two_name_dic[$n] = 1;
		}

		// The raw lists are only needed to build the tables above.
		unset($this->cn_two_name);
		unset($this->cn_one_name);

		$dicfile = PC_PATH.'libs'.DIRECTORY_SEPARATOR.'data'.DIRECTORY_SEPARATOR.'dict'.DIRECTORY_SEPARATOR.'dict.csv';
		$fp = fopen($dicfile,'r');
		// fopen() has already raised a warning; do not pass false on to
		// fgets()/fclose() (a TypeError on PHP 8).
		if($fp === false) return;

		// Compare against false explicitly so a line that evaluates to a falsy
		// string (e.g. a final "0") does not end the loop early.
		while(($line = fgets($fp,64)) !== false) {
			$ws = explode(' ',$line);
			// Skip blank lines, lines without a rank, and the continuation
			// chunks fgets() produces for lines longer than the 64-byte limit.
			if(!isset($ws[1]) || $ws[0] === '') continue;
			// Drop the line terminator so the rank is a clean numeric string.
			$this->rank_dic[strlen($ws[0])][$ws[0]] = trim($ws[1]);
		}
		fclose($fp);
	}



  /**
   * Releases the loaded dictionary.
   *
   * The property is reset to an empty array rather than unset, so the public
   * $rank_dic stays defined (and stays an array) for any later reader;
   * is_word() and get_rank() report "not found" for every word afterwards.
   */
  function clear() {
    $this->rank_dic = array();
  }

  /**
   * Stores the text to segment and clears any previous result.
   *
   * When the CHARSET constant is 'utf-8' the text is converted from UTF-8 to
   * GBK with iconv() before it is stored; otherwise it is stored unchanged.
   *
   * @param string $str Text to segment.
   */
  function get_source($str) {
    $this->result_string = '';
    $this->source_string = (CHARSET == 'utf-8') ? iconv('utf-8','gbk',$str) : $str;
  }

  /**
   * Runs only the coarse pre-segmentation pass (revise_string()) on $str.
   *
   * The result is stored in $source_string and also returned.
   *
   * @param string $str Text to pre-segment.
   * @return string The pre-segmented text.
   */
  function simple_split($str) {
    $revised = $this->revise_string($str);
    $this->source_string = $revised;
    return $revised;
  }

  /**
   * Segments $str and returns the tokens joined by $split_char.
   *
   * The text is pre-segmented by revise_string(), split on single spaces, and
   * each chunk is then either passed through unchanged (chunks shorter than
   * min_len + 1 bytes, or chunks starting with a byte below 0x81) or handed to
   * split_mm() for dictionary matching. Every emitted token is followed by the
   * separator, so a non-empty result ends with $split_char.
   *
   * The raw result is kept in $result_string; the return value is converted
   * back to UTF-8 when the CHARSET constant is 'utf-8'.
   *
   * @param string $str          Text to segment.
   * @param bool   $try_num_name Passed to split_mm(): merge numbers/names.
   * @param bool   $try_diff     Passed to split_mm(): run ambiguity pass.
   * @return string Segmented text, or '' for empty/whitespace-only input.
   */
  function split_result($str='',$try_num_name=true,$try_diff=true) {
    $str = trim($str);
    if($str === '') return '';
    $this->get_source($str);

    // Collapse runs of spaces so the explode() below yields few empty chunks.
    $this->source_string = preg_replace('/ {1,}/',' ',$this->revise_string($this->source_string));
    $spwords = explode(' ',$this->source_string);
    $spc = $this->split_char;

    // Walk backwards and prepend, which keeps the original token order.
    for($i=count($spwords)-1;$i>=0;$i--){
      $chunk = $spwords[$i];
      // A leading/trailing separator produces empty chunks; skip them before
      // indexing so no "uninitialized string offset" diagnostic is raised.
      if($chunk === '' || ord($chunk[0])<33) continue;

      if(!isset($chunk[$this->min_len]) || ord($chunk[0])<0x81) {
        // Too short to split, or starts with a single-byte character.
        $piece = $chunk;
      } else {
        $piece = $this->split_mm($chunk,$try_num_name,$try_diff);
      }
      $this->result_string = $piece.$spc.$this->result_string;
    }

    if(CHARSET=='utf-8') return iconv('gbk','utf-8',$this->result_string);
    return $this->result_string;
  }

  /**
   * Re-joins a space-separated string with $split_char, dropping empty words.
   *
   * Every kept word is prefixed with the separator, so a non-empty result
   * starts with $split_char.
   *
   * @param string $str Space-separated words.
   * @return string The re-joined words, or '' for empty input.
   */
  function par_number($str) {
    if($str == '') return '';
    $separator = $this->split_char;
    $joined = '';
    foreach(explode(' ',$str) as $word) {
      if($word == '') continue;
      $joined .= $separator.$word;
    }
    return $joined;
  }

  /**
   * Joins the words produced by split_mm(), merging numbers with units and
   * surnames with given names.
   *
   * $word_array is expected in reverse text order (last word first), which is
   * how split_mm() builds it; it is walked from the end so the output is in
   * text order. "Next word" below means $word_array[$i-1].
   *
   * - A word matching $cn_sg_num absorbs the next word when that word starts
   *   with a unit from $common_unit, otherwise it absorbs every directly
   *   following word that also matches $cn_sg_num.
   * - A 4-byte word found in $two_name_dic absorbs up to two following 2-byte
   *   words.
   * - A 2-byte word found in $one_name_dic absorbs up to two following 2-byte
   *   words, stopping at a word matching $especial_char.
   *
   * @param array $word_array Words in reverse text order.
   * @return string Words joined by $split_char, without a leading separator.
   */
  function par_other($word_array) {
    $spc = $this->split_char;
    $number_re = '/'.$this->cn_sg_num.'/';
    // Group the alternation so that "^" anchors every unit, not only the
    // first one of the list.
    $unit_re = '/^(?:'.$this->common_unit.')/';
    $stop_re = '/'.$this->especial_char.'/';
    $rsStr = '';

    for($i=count($word_array)-1;$i>=0;$i--) {
      $word = $word_array[$i];

      if(preg_match($number_re,$word)) {
        $rsStr .= $spc.$word;
        if($i>0 && preg_match($unit_re,$word_array[$i-1])) {
          $rsStr .= $word_array[$i-1]; $i--;
        } else {
          while($i>0 && preg_match($number_re,$word_array[$i-1])) {
            $rsStr .= $word_array[$i-1]; $i--;
          }
        }
        continue;
      }

      if(strlen($word)==4 && isset($this->two_name_dic[$word])) {
        // Compound surname: take up to two single-character words as the name.
        $rsStr .= $spc.$word;
        if($i>0 && strlen($word_array[$i-1])==2) {
          $rsStr .= $word_array[$i-1]; $i--;
          if($i>0 && strlen($word_array[$i-1])==2) { $rsStr .= $word_array[$i-1]; $i--; }
        }
      } else if(strlen($word)==2 && isset($this->one_name_dic[$word])) {
        // Single-character surname: same, but never swallow a stop word.
        $rsStr .= $spc.$word;
        if($i>0 && strlen($word_array[$i-1])==2) {
          if(preg_match($stop_re,$word_array[$i-1])) continue;
          $rsStr .= $word_array[$i-1]; $i--;
          if($i>0 && strlen($word_array[$i-1])==2 && !preg_match($stop_re,$word_array[$i-1])) {
            $rsStr .= $word_array[$i-1]; $i--;
          }
        }
      } else {
        $rsStr .= $spc.$word;
      }
    }

    // Strip the leading separator; quote it so a separator that is a regex
    // metacharacter (e.g. "|") is matched literally.
    $rsStr = preg_replace('/^'.preg_quote($spc,'/').'/','',$rsStr);
    return $rsStr;
  }

  /**
   * Segments one chunk by backward maximum matching against the dictionary.
   *
   * Starting at the last byte, the longest slice ending there (at most
   * max_len + 1 bytes, shrinking two bytes at a time) that is_word() accepts
   * is taken as a word; when nothing matches, the last two bytes become a word
   * of their own. Once min_len + 1 bytes or fewer remain, the head of the
   * string is emitted as one dictionary word or as two-byte pieces.
   *
   * @param string $str          Chunk to segment.
   * @param bool   $try_num_name When true the words are joined by par_other(),
   *                             otherwise each is simply prefixed with
   *                             $split_char.
   * @param bool   $try_diff     When true the joined result is trimmed and
   *                             passed through test_diff().
   * @return string The segmented chunk.
   */
  function split_mm($str,$try_num_name=true,$try_diff=true) {
    $words = array();            // collected right to left
    $pos = strlen($str) - 1;     // index of the last unconsumed byte

    while($pos >= 0) {
      if($pos <= $this->min_len) {
        // Only the head of the string is left.
        if($pos == 1) {
          $words[] = substr($str,0,2);
        } else {
          $head = substr($str,0,$this->min_len+1);
          if($this->is_word($head)) {
            $words[] = $head;
          } else {
            $words[] = substr($str,2,2);
            $words[] = substr($str,0,2);
          }
        }
        break;
      }

      $span = ($pos >= $this->max_len) ? $this->max_len : $pos;
      $matched = false;
      for($extra=$span; $extra>=0; $extra-=2) {
        $candidate = substr($str,$pos-$extra,$extra+1);
        if($this->is_word($candidate)) {
          $words[] = $candidate;
          $pos = $pos - $extra - 1;
          $matched = true;
          break;
        }
      }

      if(!$matched && $pos > 1) {
        // No dictionary word ends here: peel off one two-byte character.
        $words[] = $str[$pos-1].$str[$pos];
        $pos -= 2;
      }
    }

    if($try_num_name) {
      $joined = $this->par_other($words);
    } else {
      $joined = '';
      foreach(array_reverse($words) as $word) {
        $joined .= $this->split_char.$word;
      }
    }

    if($try_diff) $joined = $this->test_diff(trim($joined));
    return $joined;
  }

  /**
   * Unfinished helper: prepares a keyword alternation but produces no result.
   *
   * The only observable effect is that $source_string is re-run through
   * revise_string(). The keyword pattern built below is discarded and nothing
   * is returned; $str and $strlen are not used.
   *
   * NOTE(review): the name suggests this was meant to build a description of
   * $str around the keywords - confirm the intended behaviour before relying
   * on it.
   *
   * @param string $str     Unused.
   * @param string $keyword Space-separated keywords.
   * @param int    $strlen  Unused.
   * @return void
   */
  function auto_description($str,$keyword,$strlen) {
    $this->source_string = $this->revise_string($this->source_string);

    // Read the keywords from the parameter; the class has no "keywords"
    // property, so reading $this->keywords only raised an undefined-property
    // diagnostic and always yielded an empty list.
    $keywords = explode(" ",(string)$keyword);
    $regstr = "";
    foreach($keywords as $v) {
      if($v=="") continue;
      // Ignore keywords that are a single double-byte character.
      if(ord($v[0])>0x80 && strlen($v)<3) continue;
      // NOTE(review): $v must be passed through preg_quote() before this
      // string is ever used as a pattern.
      if($regstr=="") $regstr .= "($v)";
      else $regstr .= "|($v)";
    }
  }

  /**
   * Ambiguity pass: re-examines adjacent words and moves the boundary between
   * them when the dictionary prefers a different split.
   *
   * For each word and its successor (lengths are in bytes):
   * - identical neighbours are glued together;
   * - a 2-byte word followed by a 3..7-byte word: if the first 6 or 4 bytes
   *   of their concatenation form a dictionary word ranked more than twice as
   *   high as the successor, that word is emitted, followed by whatever is
   *   left of the successor;
   * - two 3..7-byte words: the first 2 (or, for a 6-byte successor, 4) bytes
   *   of the successor are moved onto the current word when that yields a
   *   dictionary word (for a 4-byte successor, additionally ranked more than
   *   twice as high as the successor).
   * Everything else is emitted unchanged. Each emitted token is prefixed with
   * $split_char, so a non-empty result starts with the separator.
   *
   * @param string $str Space-separated words.
   * @return string Re-segmented words, or "" for empty input.
   */
  function test_diff($str) {
    $str = preg_replace("/ {1,}/"," ",$str);
    if($str == ""||$str == " ") return "";
    $ws = explode(' ',$str);
    $wlen = count($ws);
    $spc = $this->split_char;
    $reStr = "";

    for($i=0;$i<$wlen;$i++) {
      $cur = $ws[$i];
      if($i>=($wlen-1)) {
        // Last word: nothing to compare it with.
        $reStr .= $spc.$cur;
        continue;
      }
      $next = $ws[$i+1];
      $curLen = strlen($cur);
      $nextLen = strlen($next);

      if($cur==$next){
        // Reduplication: keep the pair as one token.
        $reStr .= $spc.$cur.$next;
        $i++; continue;
      }

      if($curLen==2 && $nextLen<8 && $nextLen>2) {
        $addw = $cur.$next;
        $t = 6;
        $testok = false;
        while($t>=4) {
          $w = substr($addw,0,$t);
          if($this->is_word($w) && ($this->get_rank($w) > $this->get_rank($next)*2) ) {
            // $w consumed the 2 bytes of $cur plus the first $t-2 bytes of
            // $next; the rest of $next is the remainder. (The previous slice
            // arithmetic returned the wrong bytes when a 6-byte successor
            // was merged at $t == 4.)
            $limit_word = (string)substr($next,$t-2);
            if($limit_word!="") $reStr .= $spc.$w.$spc.$limit_word;
            else $reStr .= $spc.$w;
            $testok = true;
            break;
          }
          $t = $t-2;
        }
        if(!$testok) $reStr .= $spc.$cur;
        else $i++;
      } else if($curLen>2 && $curLen<8 && $nextLen>2 && $nextLen<8) {
        $t21 = substr($next,0,2);
        $t22 = substr($next,0,4);
        if($this->is_word($cur.$t21)) {
          if($curLen==6||$nextLen==6){
            $reStr .= $spc.$cur.$t21.$spc.substr($next,2,$nextLen-2);
            $i++;
          } else {
            $reStr .= $spc.$cur;
          }
        } else if($nextLen==6) {
          if($this->is_word($cur.$t22)) {
            $reStr .= $spc.$cur.$t22.$spc.$next[4].$next[5];
            $i++;
          } else {
            $reStr .= $spc.$cur;
          }
        } else if($nextLen==4) {
          $addw = $cur.$next;
          $t = $nextLen-2;
          $testok = false;
          while($t>0) {
            $w = substr($addw,0,$curLen+$t);
            if($this->is_word($w) && ($this->get_rank($w) > $this->get_rank($next)*2) ) {
              $limit_word = substr($next,$t,$nextLen-$t);
              if($limit_word!="") $reStr .= $spc.$w.$spc.$limit_word;
              else $reStr .= $spc.$w;
              $testok = true;
              break;
            }
            $t = $t-2;
          }
          if(!$testok) $reStr .= $spc.$cur;
          else $i++;
        } else {
          $reStr .= $spc.$cur;
        }
      } else {
        $reStr .= $spc.$cur;
      }
    }

    return $reStr;
  }

  /**
   * Tells whether $okWord is present in the loaded dictionary.
   *
   * $max_len is the highest byte index of a dictionary word, so the longest
   * admissible word is max_len + 1 bytes. Comparing the byte length against
   * $max_len itself rejected exactly the longest words, including the first
   * candidate split_mm() tries at every position.
   *
   * @param string $okWord Candidate word.
   * @return bool True when the word is in $rank_dic.
   */
  function is_word($okWord){
    $slen = strlen($okWord);
    if($slen > $this->max_len + 1) return false;
    return isset($this->rank_dic[$slen][$okWord]);
  }

  /**
   * Coarse pre-segmentation: walks $str byte by byte and inserts $split_char
   * wherever the character class changes.
   *
   * Bytes below 0x81 are handled as single-byte characters; any other byte is
   * taken together with the following byte as one double-byte character.
   * $prechar remembers the class of the previously emitted character and
   * decides whether a separator is needed.
   *
   * NOTE(review): the two-byte handling and the literal comparisons against
   * the book-title marks below assume the input and this source file share a
   * double-byte encoding (presumably GBK) - confirm.
   *
   * @param string $str Text to pre-segment.
   * @return string Text with separators inserted; '' for empty input.
   */
  function revise_string($str) {

  	$spc = $this->split_char;

    $slen = strlen($str);

    if($slen==0) return '';

    $okstr = '';

    $prechar = 0; // 0 = whitespace, 1 = English (ASCII), 2 = Chinese, 3 = symbol

    for($i=0;$i<$slen;$i++){

      if(ord($str[$i]) < 0x81) {

        // Single-byte character.
        if(ord($str[$i]) < 33){

          // Space or control character: becomes one separator, unless the
          // previous character was already whitespace.
          //$str[$i]!="\r"&&$str[$i]!="\n"

          if($prechar!=0) $okstr .= $spc;

          $prechar=0;

          continue;

        } else if(preg_match("/[^0-9a-zA-Z@\.%#:\\/\\&_-]/",$str[$i])) {

          // Any other ASCII symbol stands alone: separate it from whatever
          // precedes it.
          if($prechar==0) {

          	$okstr .= $str[$i]; $prechar=3;

          } else {

          	$okstr .= $spc.$str[$i]; $prechar=3;

          }

        } else {

          // Letter, digit or one of @ . % # : / & _ - : these stay glued
          // together, but are separated from a preceding Chinese character
          // or symbol.
        	if($prechar==2||$prechar==3) {

        		$okstr .= $spc.$str[$i]; $prechar=1;

        	} else {

        	  // NOTE(review): this pattern is the literal four-character
        	  // sequence "@#%:", which a single character can never match, so
        	  // this branch is unreachable as written - possibly a character
        	  // class was intended; confirm before changing, since it would
        	  // alter how e.g. "a@b" is split.
        	  if(preg_match("/@#%:/",$str[$i])){ $okstr .= $str[$i]; $prechar=3; }

        	  else { $okstr .= $str[$i]; $prechar=1; }

        	}

        }

      } else{

        // Double-byte character: separate it from preceding English text or
        // symbols.
        if($prechar!=0 && $prechar!=2) $okstr .= $spc;

        // A lone trailing lead byte (no second byte) is dropped.
        if(isset($str[$i+1])){

          $c = $str[$i].$str[$i+1];

          // Characters listed in $cn_number are mapped by get_alab_num()
          // and treated as Chinese text.
          // NOTE(review): with the ASCII content $cn_number has in this copy
          // the pattern is not valid PCRE, so preg_match() would fail here -
          // confirm the original encoding of that property.
          if(preg_match("/".$this->cn_number."/",$c)) {

          	$okstr .= $this->get_alab_num($c); $prechar = 2; $i++; continue;

          }

          // Numeric value of the two bytes, used for the range test below.
          $n = hexdec(bin2hex($c));

          // Code points 0xA140..0xAA3F are handled as symbols
          // (presumably the punctuation/symbol area of the encoding).
          if($n>0xA13F && $n < 0xAA40) {

            if($c=="《"){

              // Opening book-title mark: preceded by a literal space and kept
              // attached to the text that follows.
            	if($prechar!=0) $okstr .= $spc." 《";

            	else $okstr .= " 《";

            	$prechar = 2;

            } else if($c=="》"){

              // Closing book-title mark: attached to the preceding text and
              // followed by a literal space.
            	$okstr .= "》 ";

            	$prechar = 3;

            } else{

            	if($prechar!=0) $okstr .= $spc.$c;

            	else $okstr .= $c;

            	$prechar = 3;

            }

          } else {

            // Ordinary Chinese character: appended directly.
            $okstr .= $c;

            $prechar = 2;

          }

          // Skip the second byte of the character just handled.
          $i++;

        }

      }// end of double-byte character handling

    }// end of loop

    return $okstr;

  }

  /**
   * Placeholder for new-word discovery; currently returns its input as is.
   *
   * @param string $str    Text to inspect.
   * @param int    $maxlen Unused.
   * @return string $str, unchanged.
   */
  function find_new_word($str,$maxlen=6) {
    return $str;
  }

  /**
   * Extracts keywords from $str, most frequent first.
   *
   * The text is segmented with split_result(); tokens shorter than two bytes,
   * tokens made only of digits, ":" and "-", and single double-byte characters
   * are ignored. The remaining tokens are counted and sorted by frequency.
   *
   * @param string $str  Text to analyse.
   * @param int    $ilen -1 keeps every token whose dictionary rank is above
   *                     500; any other value keeps tokens in order for as
   *                     long as the result stays shorter than $ilen bytes.
   * @return string Keywords separated by single spaces ('' when none). The
   *                result is converted to UTF-8 when CHARSET is 'utf-8'.
   */
  function get_keyword($str,$ilen=-1) {
    // split_result() returns early on whitespace-only input without touching
    // $result_string, so bail out here instead of reusing a stale result.
    if(trim($str)=='') return '';
    $this->split_result($str,true,true);

    // Must start as an array: writing a string key into an empty string is
    // no longer converted to an array as of PHP 7.1.
    $wks = array();
    foreach(explode(' ',$this->result_string) as $w) {
      $w = trim($w);
      if(strlen($w)<2) continue;
      if(!preg_match("/[^0-9:-]/",$w)) continue;
      if(strlen($w)==2&&ord($w[0])>0x80) continue;
      if(isset($wks[$w])) $wks[$w]++;
      else $wks[$w] = 1;
    }

    arsort($wks);
    $okstr = '';
    if($ilen==-1) {
      foreach($wks as $w=>$v) {
        if($this->get_rank($w)>500) $okstr .= $w." ";
      }
    } else {
      foreach($wks as $w=>$v) {
        if((strlen($okstr)+strlen($w)+1)<$ilen) $okstr .= $w." ";
        else break;
      }
    }

    if(CHARSET=='utf-8') $okstr = iconv('gbk','utf-8',$okstr);
    return trim($okstr);
  }

  /**
   * Looks up the dictionary rank of a word.
   *
   * @param string $w Word to look up.
   * @return mixed The rank as stored in $rank_dic, or 0 when the word is
   *               not in the dictionary.
   */
  function get_rank($w){
    $bucket = strlen($w);
    return isset($this->rank_dic[$bucket][$w]) ? $this->rank_dic[$bucket][$w] : 0;
  }

  /**
   * Maps every character listed in the search table to the character at the
   * same position of the replacement string.
   *
   * The replacement string is split into single characters and paired with
   * the search table. Passing it to str_replace() as one string would replace
   * every hit with the entire 66-character string, and with search entries
   * that also occur in that string the result would grow with every
   * subsequent entry. strtr() also performs all replacements in one pass, so
   * replaced text is never searched again.
   *
   * NOTE(review): in this copy the search table holds the same ASCII
   * characters as the replacement string (apart from the "s " entry, which
   * carries a trailing space), so the mapping is nearly an identity; the
   * caller suggests the table originally held full-width forms - confirm
   * against the original file encoding.
   *
   * @param string $fnum Text to convert.
   * @return string Converted text.
   */
  function get_alab_num($fnum){
    $nums = array("0","1","2","3","4","5","6",
      "7","8","9","+","-","%",".",
      "a","b","c","d","e","f","g","h","i","j","k","l","m",
      "n","o","p","q","r","s ","t","u","v","w","x","y","z",
      "A","B","C","D","E","F","G","H","I","J","K","L","M",
      "N","O","P","Q","R","S","T","U","V","W","X","Y","Z");
    $fnums = "0123456789+-%.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
    // Both tables have 66 entries, so array_combine() pairs them one to one.
    return strtr($fnum, array_combine($nums, str_split($fnums)));
  }

}

?>