对MYSQL进行全文检索的PHP类库

正在研究中,如果要看这个程序的详细说明和演示,请到这个地方看,真的很爽: http://steven.haryan.to/php/KwIndex.html

注意,只能在linux,unix下用。

< ?php

$debug = 0;

if($debug) require “Dumper.lib”; //这个全文检索需要的库文件,你有吗?

function _debug() {

global $debug;

$args = func_get_args();

if (!$debug) return;

echo ”

debug: ", htmlentities(join("", $args)), "
\n";

}

class KwIndex {

# CONSTRUCTOR

#############

function KwIndex($args) {

# check for argument type

if (!is_array(&$args))

die(“KwIndex: constructor: syntax: KwIndex(array \$args)”);

# check for unknown arguments

$known_arguments = array_flip(array(

“linkid”, “db_name”, “hostname”, “username”, “password”,

“index_name”, “wordlist_cardinality”, “doclist_cardinality”,

“stoplist_cardinality”, “vectorlist_cardinality”,

“max_word_length”, “use_persistent_connection”));

while(list($k,$v) = each($args))

if (!isset($known_arguments[$k]))

die(“KwIndex: constructor: unknown argument `$k’”);

# required for required arguments

if (!isset($args["db_name"]))

die(“KwIndex: constructor: You must specify ‘db_name’”);

if (!isset($args["linkid"]) &&

(!isset($args["hostname"]) || !isset($args["username"]) ||

!isset($args["password"])))

die(“KwIndex: constructor: You must either specify ‘linkid’ or “.

“arguments to mysql_connect (‘hostname’, ‘username’, and “.

“‘password’)”);

# supply default values for optional arguments

if (!isset($args["index_name"]))

$args["index_name"] = “kwindex”;

if (!isset($args["wordlist_cardinality"]))

$args["wordlist_cardinality"] = 100000;

if (!isset($args["stoplist_cardinality"]))

$args["stoplist_cardinality"] = 10000;

if (!isset($args["vectorlist_cardinality"]))

$args["vectorlist_cardinality"] = 100000000;

if (!isset($args["doclist_cardinality"]))

$args["doclist_cardinality"] = 1000000;

if (!isset($args["max_word_length"]))

$args["max_word_length"] = 32;

if (!isset($args["use_persistent_connection"]))

$args["use_persistent_connection"] = 1;

# set object attributes

$this->db_name = $args["db_name"];

$this->index_name = $args["index_name"];

$this->wordlist_cardinality = $args["wordlist_cardinality"];

$this->stoplist_cardinality = $args["stoplist_cardinality"];

$this->vectorlist_cardinality = $args["vectorlist_cardinality"];

$this->doclist_cardinality = $args["doclist_cardinality"];

$this->max_word_length = $args["max_word_length"];

if (!isset($args["linkid"])) {

if ($args["use_persistent_connection"]) {

$linkid = mysql_pconnect($args["hostname"], $args["username"],

$args["password"]);

} else {

$linkid = mysql_connect($args["hostname"], $args["username"],

$args["password"]);

}

if (!$linkid)

die(“KwIndex: constructor: Can’t connect to database: “.

mysql_error());

} else {

$linkid = $args["linkid"];

}

$this->linkid = $linkid;

$idx = $this->index_name;

if (!mysql_select_db($this->db_name, $linkid))

die(“KwIndex: constructor: Can’t select DB: “.

mysql_error($linkid));

if (!$this->_index_exists()) $this->_create_index();

# load stoplist as keys of array

$this->stoplist = array();

if (!($res = mysql_query(“SELECT word FROM ${idx}_stoplist”,

$linkid)))

die(“KwIndex: constructor: Can’t load stoplist: “.

mysql_error($linkid));

while($row = mysql_fetch_row($res)) {

$this->stoplist[ strtolower($row[0]) ] = 1;

}

} // constructor

# PUBLIC METHODS

################

function &document_sub($doc_ids) {

die(“KwIndex: document_sub: this method must be overriden”);

}

function add_document($doc_ids) {

if (!is_array(&$doc_ids))

die(“KwIndex: syntax: add_document(array \$doc_ids)”);

if (!sizeof(&$doc_ids)) return 1;

$wordlist = array();

# structure: ( ‘word1′ => [ [doc_id,freq], … ], … )

$doclist = array();

# format: ( doc_id => n, … ); # n = number of words in document

# retrieve documents

####################

$docs = $this->document_sub(&$doc_ids);

if(!is_array(&$docs))

die(“KwIndex: add_document: “.

“‘document_sub’ does not return an array”);

if (sizeof(&$doc_ids) < sizeof(&$docs))

die(“KwIndex: add_document: “.

“‘document_sub’ does not return enough documents”);

if (sizeof(&$doc_ids) > sizeof(&$docs))

die(“KwIndex: add_document: “.

“‘document_sub’ returns too many documents”);

# split documents into words

############################

while(list($id, $doc) = each($docs)) {

if (!isset($doc) || !strlen($doc)) continue;

$words = $this->_split_to_words($doc);

$num_of_words = sizeof(&$words);

# note: this means that numbers, etc are counted

$doclist[$id] = $num_of_words;

# filter non-qualifying words: 1-char length, numbers, words

# that are too long

$w2 = array();

while(list($k, $v) = each($words)) {

$len = strlen($v);

$lower_v = strtolower($v);

if ($len > 1 &&

$len < = $this->max_word_length &&

preg_match(“/[a-z]/”, $lower_v) &&

!isset($this->stoplist[$lower_v])) ++$w2[ $lower_v ];

}

while(list($k, $v) = each($w2)) {

$lower_k = strtolower($k);

if (!isset($wordlist[$lower_k]))

$wordlist[$lower_k] = array();

array_push($wordlist[$lower_k], array($id, $v/$num_of_words));

}

}

#_debug(“wordlist: “, Dumper($wordlist));

# submit to database

####################

$linkid = $this->linkid;

$idx = $this->index_name;

# lock the tables in case some other process remove a certain word

# between step 0 and 1 and 2 and 3

if(!mysql_query(“LOCK TABLES ${idx}_doclist WRITE, “.

“${idx}_vectorlist WRITE, “.

“${idx}_wordlist WRITE”,

$linkid)) {

$this->ERROR = “Can’t lock tables when adding documents: “.

mysql_error($linkid);

return;

}

# 0

# add the docs first

#_debug( “doclist = “, Dumper($doclist));

while(list($k,$v) = each($doclist)) {

if(!mysql_query(“REPLACE INTO ${idx}_doclist (id,n) VALUES (“.

“‘”.(addslashes($k)).”‘”.

“,”.

“‘”.(addslashes($v)).”‘”.

“)”,

$linkid)) {

$this->ERROR = “Can’t add doc id=`$_’ to doclist: “.

mysql_error($linkid);

mysql_query(“UNLOCK TABLES”, $linkid);

return;

}

}

# 1

# and then add the words

while(list($k,$v) = each($wordlist)) {

if(!mysql_query(“INSERT IGNORE INTO ${idx}_wordlist (word) “.

“VALUES (“.

“‘”.(addslashes($k)).”‘”.

“)”,

$linkid)) {

$this->ERROR = “Can’t add word `$k’ to wordlist: “.

mysql_error($linkid);

mysql_query(“UNLOCK TABLES”, $linkid);

return

;

}

}

# 2

# get the resulting word ids

$word_ids = array();

$e_wordlist = array();

reset($wordlist);

while(list($k,$v) = each($wordlist))

array_push($e_wordlist, “‘”.addslashes($k).”‘”);

if (sizeof(&$wordlist)) {

if(!($res = mysql_query(“SELECT id,word FROM ${idx}_wordlist “.

“WHERE word IN (“.

join(‘,’, $e_wordlist).

“)”,

$linkid))) {

$this->ERROR = “Can’t get data from wordlist: “.

mysql_error($linkid);

mysql_query(‘UNLOCK TABLES’, $linkid);

return;

}

while($row = mysql_fetch_row($res)) {

#echo “row = “, Dumper($row), ”
\n”;

$word_ids[ $row[1] ] = $row[0];

}

}

# 3

# now add the vectors

#_debug(“word_ids = “, Dumper($word_ids));

reset($wordlist);

while (list($word, $hitlist) = each($wordlist)) {

while(list($i, $hit) = each($hitlist)) {

#echo “adding word=$word, hit = ($hit[0], $hit[1])
\n”;

if(!mysql_query(“INSERT INTO ${idx}_vectorlist (wid,did,f)”.

“VALUES (“.

(“‘”.addslashes($word_ids[$word])).”‘,”.

(“‘”.addslashes($hit[0])).”‘,”.

(“‘”.addslashes($hit[1])).”‘)”,

$linkid)) {

$this->ERROR = “Can’t add to vectorlist: “.

mysql_error($linkid);

mysql_query(‘UNLOCK TABLES’, $linkid);

return;

}

}

}

# if all goes well, return TRUE

mysql_query(‘UNLOCK TABLES’, $linkid);

return 1;

}

function remove_document($doc_ids) {

if (!is_array(&$doc_ids))

die(“KwIndex: syntax: remove_document(array \$doc_ids)”);

if (!sizeof(&$doc_ids)) return 1;

$linkid = $this->linkid;

$idx = $this->index_name;

if(!mysql_query(“LOCK TABLES ${idx}_doclist WRITE, “.

“${idx}_vectorlist WRITE”,

$linkid)) {

$this->ERROR = “Can’t lock tables when removing documents: “.

mysql_error($linkid);

return;

}

if(!mysql_query(“DELETE FROM ${idx}_doclist WHERE id IN (“.

join(‘,’, $doc_ids).”)”,

$linkid)) {

$this->ERROR = “Can’t delete from doclist: “.

mysql_error($linkid);

mysql_query(‘UNLOCK TABLES’, $linkid);

return;

}

if(!mysql_query(“DELETE FROM ${idx}_vectorlist WHERE did IN (“.

join(‘,’, $doc_ids).”)”,

$linkid)) {

$this->ERROR = “Can’t delete from vectorlist: “.

mysql_error($linkid);

mysql_query(‘UNLOCK TABLES’, $linkid);

return;

}

# if all goes well, return TRUE

mysql_query(“UNLOCK TABLES”, $linkid);

return 1;

}

function update_document($doc_ids) {

if (!is_array(&$doc_ids))

die(“KwIndex: syntax: update_document(array \$doc_ids)”);

if (!sizeof(&$doc_ids)) return 1;

return $this->remove_document(&$doc_ids) &&

$this->add_document(&$doc_ids);

}

# find all words that are contained in at least $k % of all documents

function &common_word($k = 80) {

$linkid = $this->linkid;

$idx = $this->index_name;

# first select the number of documents

$num = $this->document_count();

if ($num == -1) {

$this->ERROR = “Can’t retrieve the number of documents: “.

mysql_error($linkid);

return;

}

# get the statistics from vectorlist

if(!($res1 = mysql_query(“SELECT wid,COUNT(*)/$num as k FROM

${idx}_vectorlist GROUP BY wid HAVING k>=”.

($k/100),

$linkid))) {

$this->ERROR = “Can’t retrieve common words: “.

mysql_error($linkid);

return;

}

$wids = array();

while($row = mysql_fetch_row($res1)) array_push($wids, $row[0]);

# convert it to word by consulting the wordlist table

$words = array();

if (sizeof(&$wids)) {

if (!($res2 = mysql_query(“SELECT word FROM ${idx}_wordlist “.

“WHERE id IN (“.join(‘,’, $wids).

“)”,

$linkid))) {

$this->ERROR = “Can’t retrieve common words: “.

mysql_error($linkid);

return;

}

while($row = mysql_fetch_row($res2))

array_push($words, $row[0]);

}

return $words;

}

# find all words that are not contained in all documents (vectorlist)

# XXX not yet written

function &orphan_word() {

return array();

}

# remove words from index

function remove_word($words) {

if (!is_array(&$words))

die(“KwIndex: syntax: remove_word(array \$words)”);

$linkid = $this->linkid;

$idx = $this->index_name;

if(!mysql_query(“LOCK TABLES ${idx}_wordlist WRITE, “.

“${idx}_vectorlist WRITE”,

$linkid)) {

$this->ERROR = “Can’t lock tables when removing words: “.

mysql_error($linkid);

return;

}

$e_words = array();

while(list($k,$v) = each($words))

array_push($e_words, “‘”.addslashes(strtolower($v)).”‘”);

# retrieve word ids

if(!($res0 = mysql_query(“SELECT id FROM ${idx}_wordlist WHERE “.

“word IN (“.join(‘,’, $e_words).”)”,

$linkid))) {

$this->ERROR = “Can’t delete from wordlist: “.

mysql_error($linkid);

mysql_query(‘UNLOCK TABLES’, $linkid);

return;

}

$word_ids = array();

while($row = mysql_fetch_row($res0))

array_push($word_ids, $row[0]);

$word_ids = join(‘,’, $word_ids); # we’ll make it a string

if (!strlen($word_ids)) return 1;

# delete from wordlist

if(!mysql_query(“DELETE FROM ${idx}_wordlist WHERE id IN “.

“($word_ids)”,

$linkid)) {

$this->ERROR = “Can’t delete from wordlist: “.

mysql_error($linkid);

mysql_query(“UNLOCK TABLES”, $linkid);

return;

}

if(!mysql_query(“DELETE FROM ${idx}_vectorlist WHERE wid IN “.

“($word_ids)”, $linkid)) {

$this->ERROR = “Can’t delete from vectorlist: “.

mysql_error($linkid);

mysql_query(“UNLOCK TABLES”, $linkid);

return;

}

return 1;

}

# add stop words. note: you must manually delete previously indexed

# words with delete_word()

function add_stop_word($words) {

if (!is_array(&$words))

die(“KwIndex: syntax: add_stop_word(array words)”);

if (!sizeof(&$words)) return 1;

$linkid = $this->linkid;

$idx = $this->index_name;

while(list($k, $v) = each($words)) {

$lv = strtolower($v);

if(!mysql_query(“REPLACE INTO ${idx}_stoplist (word) VALUES (“.

(“‘”.addslashes($lv).”‘”).

“)”, $linkid)) {

$this->ERROR = “Can’t add to stoplist: “.

mysql_error($linkid);

return;

}

$this->stoplist[ $lv ] = 1;

}

return 1;

}

# remove stop words from index

function remove_stop_word($words) {

if (!is_array(&$words))

die(“KwIndex: syntax: remove_stop_word(array words)”);

if (!sizeof(&$words)) return 1;

$linkid = $this->linkid;

$idx = $this->index_name;

$e_words = array();

while(list($k,$v) = each($words))

array_push($e_words, “‘”.addslashes(strtolower($v)).”‘”);

if(!mysql_query(“DELETE FROM ${idx}_stoplist WHERE word IN (“.

join(‘,’, $e_words).”)”,

$linkid)) {

$this->ERROR = “Can’t delete from stoplist: “.

mysql_error($linkid);

return;

}

while(list($k,$v) = each($words))

unset($this->stoplist[ strtolower($v) ]);

return 1;

}

function is_stop_word($word) {

return isset($this->stoplist[ strtolower($word) ]);

}

function &_search_or_match_count($is_count, &$args) {

if (!isset($args["words"]))

die(“KwIndex: search: option ‘words’ must be defined”);

$linkid = $this->linkid;

$idx = $this->index_name;

# split the words if we are offered a single string/not array

# (assume it’s a phrase)

$words = array();

if (is_array(&$args["words"])) {

$words = $args["words"];

} else {

$words = $this->_split_to_words($args["words"]);

}

# delete duplicate words, convert them all to lowercase

$hashwords = array();

while(list($k,$v) = each($words)) $hashwords[ strtolower($v) ] = 1;

$words = array_keys(&$hashwords);

if (!sizeof(&$words)) {

if ($is_count) {

return 0;

} else {

return array();

}

}

# first we retrieve the word ids

$op = $args["re"] ? ‘REGEXP’:'LIKE’;

$bool = isset($args['boolean']) && $args['boolean'] &&

strtoupper($args['boolean']) == ‘AND’ ? ‘AND’:'OR’;

$op_phrases = array();

while(list($k,$v) = each($words))

array_push($op_phrases, “word $op ‘”.addslashes($v).”‘”);

if (!($res0 = mysql_query(“SELECT id FROM ${idx}_wordlist WHERE “.

join(‘ OR ‘, $op_phrases),

$linkid))) {

$this->ERROR = “Can’t retrieve word ids: “.mysql_error($linkid);

return;

}

$word_ids = array();

while($row = mysql_fetch_row($res0)) array_push($word_ids, $row[0]);

if (!sizeof(&$word_ids) ||

($bool == ‘AND’ && sizeof(&$word_ids) < sizeof(&$words))) {

if ($is_count) {

return 0;

} else {

return array();

}

}

# and then we search the vectorlist

$can_optimize=0;

$stmt = ‘;

if ($is_count) {

if ($bool == ‘AND’ && !$args['re']) {

$stmt = ‘SELECT did,count(wid) as c ‘.

“FROM ${idx}_vectorlist WHERE wid IN (“.

join(‘,’,$word_ids).

“) “.

“GROUP BY did “.

“HAVING c >= “.sizeof(&$word_ids);

} else {

$can_optimize=1;

$stmt = “SELECT COUNT(DISTINCT did) “.

“FROM ${idx}_vectorlist WHERE wid IN (“.

join(‘,’,$word_ids).

“)”;

}

} else { // ! $is_count

$stmt = “SELECT did, count(wid) as c, avg(f) as a, “.

“count(wid)*count(wid)*count(wid)*avg(f) as ca “.

“FROM ${idx}_vectorlist WHERE wid IN (“.

join(‘,’,$word_ids).

“) “.

“GROUP BY did “.

($bool == ‘AND’ && !$args['re'] ?

“HAVING c >= “.sizeof(&$word_ids):’).

” ORDER BY ca DESC “.

(isset($args['num']) ? “LIMIT ” . (isset($args['start']) ?

(($args['start'] – 1).”,”.$args['num']) : $args['num'])

:’);

}

_debug(“search SQL: “, $stmt);

if(!($res = mysql_query($stmt, $linkid))) {

$this->ERROR = “Can’t search vectorlist: “.mysql_error($linkid);

return;

}

if ($is_count) {

if ($can_optimize) {

$row = mysql_fetch_row($res);

return $row[0];

} else {

return mysql_num_rows($res);

}

} else { // ! $is_count

$doc_ids = array();

while($row = mysql_fetch_row($res)) array_push($doc_ids, $row[0]);

return $doc_ids;

}

}

function &search($args) {

return $this->_search_or_match_count(0, &$args);

}

function &match_count($args) {

return $this->_search_or_match_count(1, &$args);

}

function remove_index() {

$linkid = $this->linkid;

$idx = $this->index_name;

if(!mysql_query(“DROP TABLE IF EXISTS ${idx}_wordlist”, $linkid)) {

$this->ERROR = “Can’t remove table ${idx}_wordlist: “.

mysql_error($linkid);

return;

}

if(!mysql_query(“DROP TABLE IF EXISTS ${idx}_doclist”, $linkid)) {

$this->ERROR = “Can’t remove table ${idx}_doclist: “.

mysql_error($linkid);

return;

}

if(!mysql_query(“DROP TABLE IF EXISTS ${idx}_vectorlist”, $linkid)) {

$this->ERROR = “Can’t remove table ${idx}_vectorlist: “.

mysql_error($linkid);

return;

}

if(!mysql_query(“DROP TABLE IF EXISTS ${idx}_stoplist”, $linkid)) {

$this->ERROR = “Can’t remove table ${idx}_stoplist: “.

mysql_error($linkid);

return;

}

return 1;

}

function empty_index() {

return $this->remove_index() && $this->_create_index();

}

# number of documents in the collection

function document_count() {

$linkid = $this->linkid;

$idx = $this->index_name;

if (!($res = mysql_query(“SELECT COUNT(*) FROM ${idx}_doclist”,

$linkid))) return;

$row = mysql_fetch_row($res);

return $row[0];

}

# number of unique words

function word_count() {

$linkid = $this->linkid;

$idx = $this->index_name;

if (!($res = mysql_query(“SELECT COUNT(*) FROM ${idx}_wordlist”,

$linkid))) return;

$row = mysql_fetch_row($res);

return $row[0];

}

# PRIVATE METHODS

#################

function &_split_to_words($str) {

preg_match_all(“/\b(\w[\w']*\w+|\w+)\b/”, $str, $matches);

return $matches[1];

}

>function _create_index() {

$linkid = $this->linkid;

$idx = $this->index_name;

$stmt = ‘;

# drop previous tables, if they exist

if (!$this->remove_index()) return;

# create doclist table

$stmt = “CREATE TABLE ${idx}_doclist ” .

‘(id ‘ . $this->_int_column_type($this->doclist_cardinality).

‘ AUTO_INCREMENT PRIMARY KEY, ‘ .

‘ n ‘ . $this->_int_column_type($this->wordlist_cardinality).

‘ NOT NULL’.

‘)’;

if (!mysql_query($stmt, $linkid))

die(“KwIndex: Can’t create table ${idx}_doclist: “.

mysql_error($linkid));

# create wordlist table

$stmt = “CREATE TABLE ${idx}_wordlist ” .

‘(id ‘ . $this->_int_column_type($this->wordlist_cardinality) .

‘ AUTO_INCREMENT PRIMARY KEY, ‘ .

” word VARCHAR($this->max_word_length)” .

‘ BINARY NOT NULL, ‘ .

“UNIQUE (word) ” .

‘)’;

if(!mysql_query($stmt, $linkid))

die(“KwIndex: Can’t create table ${idx}_wordlist: “.

mysql_error($linkid));

# create stoplist table

$stmt = “CREATE TABLE ${idx}_stoplist ” .

‘(id ‘ . $this->_int_column_type($this->wordlist_cardinality).

‘ AUTO_INCREMENT PRIMARY KEY, ‘ .

” word VARCHAR($this->max_word_length)” .

‘ BINARY NOT NULL, ‘ .

“UNIQUE (word) ” .

‘)’;

if (!mysql_query($stmt, $linkid))

die(“KwIndex: Can’t create table ${idx}_stoplist: “.

mysql_error($linkid));

# create vectorlist table

$stmt = “CREATE TABLE ${idx}_vectorlist ” .

‘(wid ‘. $this->_int_column_type($this->wordlist_cardinality).

‘ NOT NULL, ‘ .

‘did ‘. $this->_int_column_type($this->doclist_cardinality).

‘ NOT NULL, ‘.

‘UNIQUE (wid,did), ‘.

‘f FLOAT(10,4) NOT NULL’.

‘)’;

if (!mysql_query($stmt, $linkid)) {

die(“KwIndex: Can’t create table ${idx}_stoplist: “.

mysql_error($linkid));

}

$this->stoplist = array();

return 1;

}

function _int_column_type($cardinality) {

if ($cardinality >= 16*1024*1024) return ‘INT UNSIGNED’;

if ($cardinality >= 64*1024) return ‘MEDIUMINT UNSIGNED’;

if ($cardinality >= 256) return ‘SMALLINT UNSIGNED’;

return ‘TINYINT UNSIGNED’;

}

function &_index_tables() {

$idx = $this->index_name;

$tables = array(

“${idx}_doclist”,

“${idx}_wordlist”,

“${idx}_vectorlist”,

“${idx}_stoplist”);

return $tables;

}

function _index_exists() {

$linkid = $this->linkid;

$idx = $this->index_name;

if (!($res = mysql_list_tables($this->db_name, $linkid)))

die(“Can’t list table: “.mysql_error($linkid));

$existing_tables = array();

$i = 0;

while ($i < mysql_num_rows($res)) {

$existing_tables[ mysql_tablename($res, $i) ] = 1;

$i++;

}

$index_tables = $this->_index_tables();

while(list($k,$v) = each($index_tables)) {

if (!isset($existing_tables[$v])) return 0;

}

return 1;

}

} // class

?>



评论权限被关闭.



赞助商

文章索引模板

好友推荐链接

强力推荐链接

分类目录

   

统计信息

Translator

Chinese (Simplified) flagItalian flagKorean flagChinese (Traditional) flagPortuguese flagEnglish flagGerman flagFrench flagSpanish flagJapanese flagArabic flagRussian flagGreek flagDutch flagBulgarian flagCzech flag
Croatian flagDanish flagFinnish flagPolish flagSwedish flagNorwegian flag          

标签

专利战 世界 中国 为什么 介绍 使用 公司 分析 利用 功能 原谅我红尘颠倒 发现 天涯 如何 实现 工具 应该 慕容雪村 技术 插件 搜索引擎 支持 数据库 文件 方式 时间 服务器 用户 简单 管理 系统 网站 美国 谁的心不曾柔软 进行 部分 问题 AJAX Google LAN Linux MySQL PHP plugin WordPress

热门浏览