聚类（8）

论坛元老

Rank: 8 Rank: 8

UID: 1066743

1^#

打印

字体大小: tT

look_w发表于 2019-4-17 19:05 | 只看该作者

聚类（8）

Tokeniser.cs

/*
Tokenization
Author: Thanh Ngoc Dao - Thanh.dao@gmx.net
Copyright (c) 2005 by Thanh Ngoc Dao.
*/

using System;
using System.Collections;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using WawaSoft.Search.Common;


namespace WawaSoft.Search.Common
{
      /// <summary>
      /// Splits a string into lower-cased word tokens, discarding delimiters,
      /// blank fragments and stop words.
      /// </summary>
      internal class Tokeniser : ITokeniser
      {
         /// <summary>
         /// Delimiter set used to split the input: space, tab, braces,
         /// parentheses, colon, semicolon, period and newline.
         /// The pattern is wrapped in a capturing group, so Regex.Split also
         /// returns the delimiters themselves; Partition filters them out again.
         /// Built once and shared, instead of being re-created on every call.
         /// </summary>
         private static readonly Regex Delimiters = new Regex("([ \\t{}():;. \n])");

         /// <summary>
         /// Performs a simple delimiter-based tokenization of
         /// <paramref name="input"/>, ignoring case. A dedicated word
         /// segmentation algorithm (e.g. for Chinese text) could be
         /// substituted in real-world use.
         /// </summary>
         /// <param name="input">The text to tokenize; must not be null.</param>
         /// <returns>
         /// The lower-cased tokens in their original order, excluding delimiter
         /// fragments, empty/whitespace-only fragments, and any token rejected
         /// by StopWordsHandler.IsStopword.
         /// </returns>
         /// <exception cref="ArgumentNullException">
         /// <paramref name="input"/> is null.
         /// </exception>
         public IList<string> Partition(string input)
         {
            if (input == null)
            {
               throw new ArgumentNullException("input");
            }

            string[] tokens = Delimiters.Split(input.ToLower());

            List<string> filter = new List<string>(tokens.Length);

            for (int i = 0; i < tokens.Length; i++)
            {
               // Work on the current element; the original listing had lost
               // the [i] indexer and passed the whole array instead.
               string token = tokens[i];

               // Split emits the captured delimiters as separate fragments,
               // so anything that still matches the delimiter set is dropped.
               if (Delimiters.IsMatch(token))
               {
                  continue;
               }

               // Skip the empty strings Split produces between adjacent delimiters.
               if (token.Trim().Length == 0)
               {
                  continue;
               }

               if (StopWordsHandler.IsStopword(token))
               {
                  continue;
               }

               filter.Add(token);
            }

            return filter.ToArray();
         }


         /// <summary>
         /// Creates a tokeniser. The instance holds no per-object state.
         /// </summary>
         public Tokeniser()
         {
         }

      }
}

收藏分享评分

回复引用

订阅 TOP

返回列表