日期:2013-04-03  浏览次数:20574 次

一个用php写的中文分词类


<?php
class Segmentation {
    // Behaviour flags, toggled through setLowercase() and setSegmentEnglish().
    // Defaults: 'lowercase' on, 'segment_english' off.
    // NOTE(review): how the flags influence segmentation is decided in
    // segmentString(); confirm there before relying on these names.
    var $options = array('lowercase' => TRUE,
                         'segment_english' => FALSE);
    // Dictionary name taken from the header line of the file passed to load();
    // stays 'Unknown' until a dictionary supplies one.
    var $dict_name = 'Unknown';
    // Word set filled by load(): each dictionary word is a key mapped to 1.
    var $dict_words = array();
    function setLowercase($value) {
        if ($value) {
            $this->options['lowercase'] = TRUE;
        } else {
            $this->options['lowercase'] = FALSE;
        }
        return TRUE;
    }
    function setSegmentEnglish($value) {
        if ($value) {
            $this->options['segment_english'] = TRUE;
        } else {
            $this->options['segment_english'] = FALSE;
        }
        return TRUE;
    }
    function load($dict_file) {
        if (!file_exists($dict_file)) {
            return FALSE;
        }
        $fp = fopen($dict_file, 'r');
        $temp = fgets($fp, 1024);
        if ($temp === FALSE) {
            return FALSE;
        } else {
            if (strpos($temp, "\t") !== FALSE) {
                list ($dict_type, $dict_name) = explode("\t", trim($temp));
            } else {
                $dict_type = trim($temp);
                $dict_name = 'Unknown';
            }
            $this->dict_name = $dict_name;
            if ($dict_type !== 'DICT_WORD_W') {
                return FALSE;
            }
        }
        while (!feof($fp)) {
            $this->dict_words[rtrim(fgets($fp, 32))] = 1;
        }
        fclose($fp);
        return TRUE;
    }
    function getDictName() {
        return $this->dict_name;
    }
    function segmentString($str) {
        if (count($this->dict_words) === 0) {
&nb