emlog改造-sphinx全文检索支持

默认情况下,emlog的搜索效果非常差:底层实现只是对文章标题做SQL LIKE模糊匹配,因此需要优化一下~

mysql原生的全文检索的局限:

1.  mysql5.6以下的版本,InnoDB不支持;

2.  仅支持char、varchar和text类型的字段;

3. 需要先分词-入库;  分词-查找. 比如php的分词扩展php_scws,配置较为繁琐.

4. 仅适用于mysql,不同数据库有不同的方案.

基于如上几点局限, 因而转向了sphinx.


按照之前文章的步骤,把CoreSeek(Sphinx)安装与mmseg词库扩展 部署完成后, 

现在开始改造emlog,

1. 拿到sphinx php相关类和实例.

coreseek官网的压缩包中,路径如下: coreseek-3.2.14/testpack


2.将sphinxapi.php重命名为: sphinxclient.php ,放入emlog/include/lib/目录下.

(重命名的原因是为了便于emlog __autoload机制自动加载.)


3. 改造emlog/include/lib/search_controller.php 文件.

主要是两个地方: 替换获取数据源的函数 和 分页计算 .

目前在分页这里排序还有点不足, 根据SQL IN()返回的数据,排序已经不是sphinx返回的id顺序了,

因此还得按照sphinx的id顺序(匹配优先级)排序数组,然后再分页展示.

目前排序这里用的还是比较笨的方法:直接遍历全部符合要求的文档.

有更好的排序方法欢迎留言反馈~


<?php
/**
 * 搜索文章
 *
 * @copyright (c) Emlog All Rights Reserved
 */
 

/**
 * Search controller that resolves keyword searches through a Sphinx
 * (CoreSeek) full-text index instead of SQL LIKE matching.
 *
 * Flow: display() parses the request, search_sphinx() queries searchd,
 * sphinx_desc() turns the raw answer into display data, and display() then
 * loads the matching posts, restores the Sphinx ranking order, paginates,
 * highlights the matched words and renders the list view.
 */
class Search_Controller {

	/**
	 * Run a full-text query against searchd and summarise the answer.
	 *
	 * @param string $search_str Raw search phrase (not HTML- or SQL-escaped).
	 * @return array Summary produced by sphinx_desc().
	 */
	public function search_sphinx($search_str){
		// Must stay first: creating the client triggers autoloading of
		// sphinxclient.php, which presumably defines the SPH_* constants
		// used below (confirm against the bundled API file).
		$sphinx_client = new SphinxClient();

		$q = $search_str;
		//MatchMode:SPH_MATCH_ALL,SPH_MATCH_ANY,SPH_MATCH_BOOLEAN,SPH_MATCH_EXTENDED,SPH_MATCH_EXTENDED2,SPH_MATCH_PHRASE
		$mode = SPH_MATCH_ANY;
		$host = "localhost";
		$port = 9312;
		$index = "*";
		$groupby = "";
		$groupsort = "@group desc";
		$filter = "group_id";
		$filtervals = array();
		$distinct = "";
		$sortby = "";//@relevance DESC, @id DESC
		$sortexpr = "";// was read without ever being defined
		$sortmode = SPH_SORT_RELEVANCE;
		$limit = 20;
		$ranker = SPH_RANK_BM25;//SPH_RANK_BM25,SPH_RANK_NONE,SPH_RANK_WORDCOUNT,SPH_RANK_FIELDMASK
		$select = "";

		////////////
		// do query
		////////////

		$sphinx_client->SetServer ( $host, $port );
		$sphinx_client->SetConnectTimeout ( 3 );
		$sphinx_client->SetArrayResult ( true );
		$sphinx_client->SetWeights ( array ( 100, 1 ) );
		$sphinx_client->SetMatchMode ( $mode );
		if ( count($filtervals) )	$sphinx_client->SetFilter ( $filter, $filtervals );
		if ( $groupby )				$sphinx_client->SetGroupBy ( $groupby, SPH_GROUPBY_ATTR, $groupsort );
		if ( $sortby )				$sphinx_client->SetSortMode ( SPH_SORT_EXTENDED, $sortby );
		if ( $sortexpr )			$sphinx_client->SetSortMode ( SPH_SORT_EXPR, $sortexpr );
		if ( $sortmode )			$sphinx_client->SetSortMode ( $sortmode );
		if ( $distinct )			$sphinx_client->SetGroupDistinct ( $distinct );
		if ( $select )				$sphinx_client->SetSelect ( $select );
		// At most $limit matches are requested, so pagination never sees
		// more than $limit posts.
		if ( $limit )				$sphinx_client->SetLimits ( 0, $limit, ( $limit>1000 ) ? $limit : 1000 );
		$sphinx_client->SetRankingMode ( $ranker );

		$ret = array();
		$ret['rs'] = $sphinx_client->Query ( $q, $index );
		$ret['lasterror'] = $sphinx_client->GetLastError();
		$ret['warnning'] = $sphinx_client->GetLastWarning();
		$ret['keyword'] = $q;

		return $this->sphinx_desc($ret);
	}

	/**
	 * Convert the raw query result into the structure used by display()
	 * and by the list template.
	 *
	 * @param array $ret Keys: 'rs' (query result array, or false on failure),
	 *                   'lasterror', 'warnning', 'keyword'.
	 * @return array On failure: 'status' => false and 'errormsg'.
	 *               On success: 'status' => true, 'total_found', 'total',
	 *               'desc' (HTML summary), 'seg' (word => HTML statistics),
	 *               'sql_instr' (comma-separated ids) and 'index' (ids in the
	 *               order the matches were returned).
	 */
	public function sphinx_desc($ret){
		$res = isset($ret['rs']) ? $ret['rs'] : false;
		$show_data = array();
		$show_data['status'] = true;

		if (empty($res) || !is_array($res)) {
			$show_data['status'] = false;
			// The error text is stored on $ret; $res is false at this point.
			$show_data['errormsg'] = "Query failed: " . (isset($ret['lasterror']) ? $ret['lasterror'] : '');
			return $show_data;
		}

		// 'matches' is absent when nothing matched, so never count() it blindly.
		$matches = (isset($res['matches']) && is_array($res['matches'])) ? $res['matches'] : array();

		// double_encode=false keeps an already-escaped keyword from being escaped twice.
		$safe_keyword = htmlspecialchars(isset($ret['keyword']) ? (string)$ret['keyword'] : '', ENT_QUOTES, 'UTF-8', false);

		//Query '' retrieved 2 of 2 matches in 0.019 sec.
		$show_data['total_found'] = isset($res['total_found']) ? $res['total_found'] : 0;
		$show_data['total'] = count($matches);
		$show_data['desc'] = '查询关键字:<label class="sphinx_keyword">'.$safe_keyword.'</label>, 共找到<label class="sphinx_numbers">'.$show_data['total'].'</label>条记录,出现<label class="sphinx_numbers">'.$show_data['total_found'] .'</label>次,耗时:<label class="sphinx_numbers">'.(isset($res['time']) ? $res['time'] : '') .'</label>秒.';

		// Per-word statistics, e.g.:
		//   'test'   found 12 times in 5 documents
		//   'manual' found 8 times in 5 documents
		$show_data['seg'] = array();
		if (isset($res['words']) && is_array($res['words'])) {
			foreach ($res['words'] as $word => $info) {
				$show_data['seg']["$word"] = '共有<label class="sphinx_numbers">'.$info['docs'].'</label>篇文章包含关键字,出现记录数:<label class="sphinx_numbers">'.$info['hits'].'</label>.';
			}
		}

		// Collect document ids in result order. Only purely numeric ids are
		// kept because the list is later spliced into an SQL IN (...) clause.
		$article_ids = array();
		foreach ($matches as $docinfo) {
			if (isset($docinfo['id']) && preg_match('/^\d+$/', (string)$docinfo['id'])) {
				$article_ids[] = $docinfo['id'];
			}
		}

		$show_data['sql_instr'] = implode(",", $article_ids);
		$show_data['index'] = $article_ids;
		return $show_data;
	}

	/**
	 * Wrap every occurrence of $word in $html with the highlight label,
	 * skipping occurrences that lie inside an HTML tag (including the
	 * labels added for previously highlighted words).
	 *
	 * @param mixed  $html Markup or plain text; non-strings are returned untouched.
	 * @param string $word Word to highlight.
	 * @return mixed
	 */
	private function highlight_word($html, $word){
		if (!is_string($html) || $html === '' || $word === '') {
			return $html;
		}
		// (?![^<>]*>) rejects a match that is followed by '>' before any '<',
		// i.e. one that sits inside a tag. Matching is case-insensitive on the
		// assumption that Sphinx reports its keywords case-normalised.
		$pattern = '/' . preg_quote($word, '/') . '(?![^<>]*>)/iu';
		$marked = preg_replace($pattern, '<label class=\'sphinx_highlight\'>$0</label>', $html);
		// preg_replace() yields null on error (e.g. invalid UTF-8): keep the input.
		return $marked === null ? $html : $marked;
	}

	/**
	 * Highlight all segmented words in the given posts and append the words
	 * to each post URL as highlight_N parameters for the article page.
	 *
	 * @param array $logs Post rows for the current page.
	 * @param array $seg  Map whose keys are the segmented words.
	 * @return array The decorated rows.
	 */
	private function highlight_logs($logs, $seg){
		if (empty($seg) || empty($logs)) {
			return $logs;
		}
		$fields = array('title', 'log_title', 'content', 'log_description');
		$k_index = 0;
		foreach ($seg as $word => $info) {
			// Numeric words come back from the array as integer keys.
			$word = (string)$word;
			foreach ($logs as $i => $row) {
				foreach ($fields as $field) {
					if (isset($row[$field])) {
						$logs[$i][$field] = $this->highlight_word($row[$field], $word);
					}
				}
				if (isset($row['log_url'])) {
					// Start a query string when the URL does not have one yet.
					$glue = strpos($row['log_url'], '?') === false ? '?' : '&';
					$logs[$i]['log_url'] = $row['log_url'] . $glue . "highlight_{$k_index}=" . urlencode($word);
				}
			}
			$k_index++;
		}
		return $logs;
	}

	/**
	 * Handle a search request and render the result list.
	 *
	 * @param array $params Routed request segments; [1]/[2] carry
	 *                      'keyword'/<value>, [4]/[5] carry 'page'/<number>.
	 */
	public function display($params) {
		$Log_Model = new Log_Model();
		$options_cache = Option::getAll();
		// Exposes the site options as local variables, both for this method
		// ($index_lognum) and for the included views.
		extract($options_cache);

		$page = (isset($params[4], $params[5]) && $params[4] == 'page') ? abs(intval($params[5])) : 1;
		if ($page < 1) {
			$page = 1;
		}

		// Raw phrase for Sphinx. The HTML/SQL-LIKE escaping below is only
		// meant for the views and would corrupt the full-text query.
		$sphinx_query = (isset($params[1], $params[2]) && $params[1] == 'keyword') ? urldecode(trim($params[2])) : '';
		$keyword = addslashes(htmlspecialchars($sphinx_query));
		$keyword = str_replace(array('%', '_'), array('\%', '\_'), $keyword);

		// Defaults so the views never see undefined variables.
		$pageurl = '';
		$page_url = '';
		$logs = array();
		$lognum = 0;
		$total_pages = 0;

		$sphinx_data = $this->search_sphinx($sphinx_query);
		$sphinx_has_hits = $sphinx_data['status'] !== false
			&& $sphinx_data['total_found'] >= 1
			&& !empty($sphinx_data['index']);

		if ($sphinx_has_hits) {
			// Guard against a missing or zero "posts per page" option.
			$sphinx_per_page = max(1, isset($index_lognum) ? intval($index_lognum) : 0);

			// ORDER BY is irrelevant here: rows are re-ordered below.
			$sqlSegment = "and gid IN (".$sphinx_data['sql_instr'].")  order by date desc";
			$unsorted_logs = $Log_Model->getLogsForHomeBySphinx($sqlSegment, $page, $sphinx_per_page);

			// Index rows by id, then walk the Sphinx id list to restore the
			// relevance order. Ids without a visible row are skipped
			// instead of producing null entries.
			$sphinx_logs_by_gid = array();
			foreach ($unsorted_logs as $u_log) {
				$sphinx_logs_by_gid["k_".$u_log['gid']] = $u_log;
			}
			$sphinx_ranked_logs = array();
			foreach ($sphinx_data['index'] as $sphinx_gid) {
				if (isset($sphinx_logs_by_gid["k_".$sphinx_gid])) {
					$sphinx_ranked_logs[] = $sphinx_logs_by_gid["k_".$sphinx_gid];
				}
			}

			$lognum = count($sphinx_ranked_logs);
			$total_pages = (int)ceil($lognum / $sphinx_per_page);
			if ($total_pages > 0 && $page > $total_pages) {
				$page = $total_pages;
			}

			// Only the rows of the current page need highlighting.
			$logs = array_slice($sphinx_ranked_logs, ($page - 1) * $sphinx_per_page, $sphinx_per_page);
			$logs = $this->highlight_logs($logs, $sphinx_data['seg']);

			if ($lognum > 0) {
				// Use the raw phrase so it round-trips unchanged to the next page.
				$pageurl .= BLOG_URL.'?keyword='.urlencode($sphinx_query).'&page=';
				$page_url = pagination($lognum, $sphinx_per_page, $page, $pageurl);
			}
		}

		include View::getView('header');
		include View::getView('log_list');
	}
}


3.由于分页相关计算已经改变,因此需要修改下: emlog/include/model/log_model.php 的getLogsForHome()方法.

为了不影响原有搜索功能,新增方法:


/**
	 * Load the published posts selected by $condition for the Sphinx-backed
	 * search page and decorate each row with the fields the list view uses.
	 *
	 * No LIMIT is applied: the caller re-orders the rows and paginates them
	 * itself. $page and $perPageNum are accepted only so the signature
	 * parallels getLogsForHome(); they are not used.
	 *
	 * @param string $condition  SQL fragment appended verbatim to the WHERE
	 *                           clause. It is NOT escaped here, so the caller
	 *                           must build it from trusted values only.
	 * @param int    $page       Unused.
	 * @param int    $perPageNum Unused. Given a default because a required
	 *                           parameter after an optional one is deprecated
	 *                           as of PHP 8.
	 * @return array List of post rows.
	 */
	function getLogsForHomeBySphinx($condition = '', $page = 1, $perPageNum = 0) {
		$timezone = Option::get('timezone');
		$sql = "SELECT * FROM " . DB_PREFIX . "blog WHERE type='blog' and hide='n' and checked='y' $condition ";
		$res = $this->db->query($sql);
		$logs = array();
		while ($row = $this->db->fetch_array($res)) {
			// Shift the stored timestamp by the configured timezone offset (hours).
			$row['date'] += $timezone * 3600;
			$row['log_title'] = htmlspecialchars(trim($row['title']));
			$row['log_url'] = Url::log($row['gid']);
			$row['logid'] = $row['gid'];

			$cookieName = 'em_logpwd_' . $row['gid'];
			$cookiePassword = isset($_COOKIE[$cookieName]) ? addslashes(trim($_COOKIE[$cookieName])) : '';
			// Strict string comparison: with a loose != two different
			// numeric-looking passwords (e.g. "1e3" and "1000") compare equal.
			if (!empty($row['password']) && $cookiePassword !== (string)$row['password']) {
				$row['excerpt'] = '<p>[该文章已设置加密,请点击标题输入密码访问]</p>';
			} elseif (!empty($row['excerpt'])) {
				$row['excerpt'] .= '<p class="readmore"><a href="' . Url::log($row['logid']) . '">阅读全文&gt;&gt;</a></p>';
			}

			// Fall back to breakLog() on the content when there is no excerpt.
			$row['log_description'] = empty($row['excerpt']) ? breakLog($row['content'], $row['gid']) : $row['excerpt'];
			$row['attachment'] = '';
			$row['tag'] = '';
			$row['tbcount'] = 0;// kept for templates that still reference trackbacks
			$logs[] = $row;
		}
		return $logs;
	}


4.列表展示,高亮关键字.以emlog默认模版为例:

log_list.php


<?php 
// Excerpt of the list template: the if/foreach opened here are closed
// further down in the template, outside this excerpt.
// Render the list only when there are posts to show.
if (!empty($logs)):
// A keyword longer than one byte is treated as "this is a search request".
if(strlen($keyword)>1):
?>
<div id="sphinx_result"><?php echo $sphinx_data['desc']  ?></div>
<div style="clear:both;"></div>
<?php 
endif;
foreach($logs as $value): 
?>
	<h2><?php topflg($value['top'], $value['sortop'], isset($sortid)?$sortid:'');
	// Search hits open in a new tab, regular list entries in the same one.
	// log_title is echoed as-is, so any highlight markup it carries is rendered.
	if(strlen($keyword)>1){
			echo '<a href="'.$value['log_url'].'"  target="_blank">'.$value['log_title'].'</a>';
	}else{
			echo '<a href="'.$value['log_url'].'"  target="_self">'.$value['log_title'].'</a>';
	}
	?>
	</h2>


echo_log.php


		<?php 
			// Highlight the search words handed over by the result list as
			// highlight_0, highlight_1, ... query parameters.
			foreach ($_GET as $k => $v) {
				// Accept only keys that START with "highlight_" and carry a
				// non-empty string; array values such as highlight_0[]=x are ignored.
				if (strpos((string)$k, "highlight_") !== 0 || !is_string($v) || $v === '') {
					continue;
				}
				// (?![^<>]*>) skips occurrences inside an HTML tag, so neither the
				// article markup nor labels added for an earlier word get corrupted.
				// Case-insensitive on the assumption that the list passes
				// case-normalised words.
				$highlighted = preg_replace(
					'/' . preg_quote($v, '/') . '(?![^<>]*>)/iu',
					'<label class=\'sphinx_highlight\'>$0</label>',
					$log_content
				);
				// null signals a regex error (e.g. invalid UTF-8): keep the content as is.
				if ($highlighted !== null) {
					$log_content = $highlighted;
				}
			}
		 echo $log_content;
		  ?>


5. 默认模版样式增加emlog/content/templates/default/main.css


/* Summary line rendered above the search result list. */
#sphinx_result{
	padding-top:20px;
	font-size: 21px;
	text-decoration:none;
}
/* The searched keyword inside the summary line. */
.sphinx_keyword{
	color: #E02528;
	font-weight: bold;
}

/* A matched word highlighted in titles and content. */
.sphinx_highlight{
	color: #E02528;
	background-color: #F7E875;
	font-weight: bold;
}

/* Counters and timing figures in the summary and word statistics. */
.sphinx_numbers{
	color: #1188C1;
}


6.最后一点: 优化sphinx的查询配置和查询选项.

由于博客需求很少,我就没有单独提供查询条件的选项了.

有兴趣的,可以参见详细的sphinx文档.

另,对于其他数据源支持,可以参见coreseek的文档.

记得加入启动项:

/usr/local/coreseek/bin/searchd -c /usr/local/coreseek/etc/csft_mysql.conf

===============

相关源码下载: include.rar

末了,附上本博客的sphinx全文检索截图(按照匹配度排序实现)~

| 0个评论