日期:2014-05-20  浏览次数:20743 次

请大家推荐一下判断两篇文章相似度的方法。我尝试用 Lucene 模糊搜索,可以很好地判断5、6个字的标题,但对正文内容就不行了。有什么框架或方法可以实现?
我得从各个网站采集一些内容,并判断它们是否重复。已经确定仅用标题和其他信息不能达到很好的效果,只能通过文章内容来判断;但内容也可能有些字被修改过,所以不会完全一样,可能多几个字,也可能少几个字。通过什么方法或者什么框架可以判断两篇文章的相似度?

------解决方案--------------------
用最长公共子序列(LCS,Longest Common Subsequence)算法或许可以,它可以找出两个字符串的最长公共子序列(字符的先后顺序一致,但不要求连续);公共子序列越长,说明两篇文章越相似

Java code

/*
 * @author talent_marquis<甜菜侯爵>
 * Email: talent_marquis@163.com
 * Copyright (C) 2007 talent_marquis<甜菜侯爵>
 * All rights reserved.
 */


package ustc.mse.algorithms.dynamicProgramming;


/**
 * Longest Common Subsequence (LCS) of two strings, computed with a
 * dynamic-programming table over their {@code char} sequences.
 *
 * <p>A subsequence keeps the relative order of characters but does not have
 * to be contiguous. Comparison is per UTF-16 {@code char}.
 *
 * <p>Typical use:
 * <pre>
 *   LCS lcs = new LCS( a, b );
 *   int length = lcs.getLCSLength();
 *   String common = lcs.getLCS();
 * </pre>
 *
 * <p>Instances hold mutable state and are not synchronized.
 */
public class LCS
{
    /**
     * Back-pointer recorded for every filled table cell: which neighbouring
     * cell the value was derived from.
     */
    public enum DIRECTION { TOP, TOP_LEFT, LEFT }

    private final char[] first;
    private final char[] second;
    private int[][] lcsTable;
    private DIRECTION[][] lcsAssistTable;
    private int lcsLength;
    private String lcs_str;
    /** True once the tables have been filled by a length computation. */
    private boolean tablesBuilt;

    /**
     * Creates a comparison of the two given strings. Nothing is computed
     * yet; the tables are allocated (all zero / all null) so the table
     * getters never return null.
     *
     * @param str1 first string, must not be null
     * @param str2 second string, must not be null
     * @throws NullPointerException if either argument is null
     */
    public LCS( String str1, String str2 )
    {
        if ( str1 == null || str2 == null )
        {
            throw new NullPointerException( "str1 and str2 must not be null" );
        }
        first = str1.toCharArray();
        second = str2.toCharArray();
        lcsTable = new int[ first.length + 1 ][ second.length + 1 ];
        lcsAssistTable = new DIRECTION[ first.length + 1 ][ second.length + 1 ];
        lcs_str = null;
        tablesBuilt = false;
    }

    /** Small demo: prints the LCS length and the LCS of two sample sentences. */
    public static void main(String[] args)
    {
        String a = "我抄我抄我抄抄抄:明月几时有,把酒问青天,不知天上宫阙,今夕是何年";
        String b = "苏轼曾经写过“明月几时有,把酒问青天”的千古名句";

        LCS lcs = new LCS( a, b );

        lcs.getLCSLength();
        lcs.runLCS();
        println( "最大相似子字符串长度是:" + lcs.getLCSLength() );
        println( "最大相似子字符串为:" + lcs.getLCS() );
    }

    /**
     * Fills the length table and the direction table for the two strings
     * and returns the LCS length. The tables are rebuilt on every call.
     *
     * @return length of the longest common subsequence
     */
    public int getLCSLength()
    {
        lcsLength = getLCSLength( first, second );
        return lcsLength;
    }

    /**
     * Builds both tables for the given character arrays.
     * Cell {@code [i + 1][j + 1]} describes the prefixes
     * {@code one[0..i]} and {@code two[0..j]}.
     */
    private int getLCSLength( char[] one, char[] two )
    {
        // Freshly allocated arrays are zero / null filled, so row 0 and
        // column 0 (the "empty prefix" border) need no explicit init loop.
        lcsTable = new int[ one.length + 1 ][ two.length + 1 ];
        lcsAssistTable = new DIRECTION[ one.length + 1 ][ two.length + 1 ];

        for ( int i = 0; i < one.length; i++ )
        {
            for ( int j = 0; j < two.length; j++ )
            {
                if ( one[ i ] == two[ j ] )
                {
                    // Matching characters extend the LCS of both shorter prefixes.
                    lcsTable[ i + 1 ][ j + 1 ] = lcsTable[ i ][ j ] + 1;
                    lcsAssistTable[ i + 1 ][ j + 1 ] = DIRECTION.TOP_LEFT;
                }
                else if ( lcsTable[ i ][ j + 1 ] >= lcsTable[ i + 1 ][ j ] )
                {
                    // Ties go to TOP, which fixes which LCS is reported
                    // when several of equal length exist.
                    lcsTable[ i + 1 ][ j + 1 ] = lcsTable[ i ][ j + 1 ];
                    lcsAssistTable[ i + 1 ][ j + 1 ] = DIRECTION.TOP;
                }
                else
                {
                    lcsTable[ i + 1 ][ j + 1 ] = lcsTable[ i + 1 ][ j ];
                    lcsAssistTable[ i + 1 ][ j + 1 ] = DIRECTION.LEFT;
                }
            }
        }

        tablesBuilt = true;
        lcsLength = lcsTable[ one.length ][ two.length ];
        return lcsLength;
    }

    /**
     * Reconstructs the LCS string from the direction table and stores it
     * for {@link #getLCS()}. The tables are built first if no length
     * computation has run yet. Calling this repeatedly always yields the
     * same string.
     */
    public void runLCS()
    {
        if ( !tablesBuilt )
        {
            getLCSLength();
        }
        lcs_str = traceBack();
    }

    /**
     * Walks the direction table from the bottom-right cell back to the
     * border. Iterative on purpose: the walk takes up to
     * {@code first.length + second.length} steps, which as recursion depth
     * could exhaust the stack on long inputs.
     */
    private String traceBack()
    {
        int i = first.length;
        int j = second.length;
        // The number of TOP_LEFT steps on the path equals the table value in
        // the starting cell, so the result can be filled from the back.
        int remaining = lcsTable[ i ][ j ];
        char[] result = new char[ remaining ];

        while ( i > 0 && j > 0 )
        {
            DIRECTION step = lcsAssistTable[ i ][ j ];
            if ( step == DIRECTION.TOP_LEFT )
            {
                result[ --remaining ] = first[ i - 1 ];
                i--;
                j--;
            }
            else if ( step == DIRECTION.TOP )
            {
                i--;
            }
            else
            {
                j--;
            }
        }
        return new String( result );
    }

    /**
     * Renders the direction table, one line per row. LEFT, TOP and TOP_LEFT
     * cells are shown as left, up and up-left arrows followed by a space;
     * unset (null) cells produce no output. The arrows are written as
     * Unicode escapes so the encoding of this source file cannot corrupt
     * them; the previous literals were "?? " for both LEFT and TOP, which
     * made the two directions indistinguishable.
     *
     * @return the direction table as text
     */
    public String getLCSAssistMatrixString()
    {
        StringBuilder text = new StringBuilder();
        for ( DIRECTION[] row : lcsAssistTable )
        {
            for ( DIRECTION element : row )
            {
                if ( element == DIRECTION.LEFT )
                {
                    text.append( "\u2190 " );
                }
                else if ( element == DIRECTION.TOP )
                {
                    text.append( "\u2191 " );
                }
                else if ( element == DIRECTION.TOP_LEFT )
                {
                    text.append( "\u2196 " );
                }
            }
            text.append( "\n" );
        }
        return text.toString();
    }

    /**
     * Renders the length table, one line per row, each value followed by a
     * single space.
     *
     * @return the length table as text
     */
    public String getLCSMatrixString()
    {
        StringBuilder text = new StringBuilder();
        for ( int[] row : lcsTable )
        {
            for ( int element : row )
            {
                text.append( element ).append( ' ' );
            }
            text.append( "\n" );
        }
        return text.toString();
    }

    /** Prints {@code o} to standard output without a line break. */
    public static void print( Object o )
    {
        System.out.print( o );
    }

    /** Prints {@code o} to standard output followed by a line break. */
    public static void println( Object o )
    {
        System.out.println( o );
    }

    /**
     * Returns the longest common subsequence. If {@link #runLCS()} has not
     * been called yet, it is run on demand instead of returning null.
     *
     * @return the LCS string, never null
     */
    public String getLCS()
    {
        if ( lcs_str == null )
        {
            runLCS();
        }
        return lcs_str;
    }

    /**
     * @return the characters of the first string (the internal array, not a copy)
     */
    public char[] getFirstCharArray()
    {
        return first;
    }

    /**
     * @return the characters of the second string (the internal array, not a copy)
     */
    public char[] getSecondCharArray()
    {
        return second;
    }

    /**
     * @return the direction table (the internal array, not a copy); all
     *         cells are null until a length computation has run
     */
    public DIRECTION[][] getLcsAssistTable()
    {
        return lcsAssistTable;
    }

    /**
     * @return the length table (the internal array, not a copy); all cells
     *         are zero until a length computation has run
     */
    public int[][] getLcsTable()
    {
        return lcsTable;
    }
}