def segment(text, segs):
    """Split *text* into pieces according to the boundary flags in *segs*.

    The same routine works for sentence splitting or word segmentation; it
    only depends on where the flags are set.

    Args:
        text: The string (or any sliceable sequence) to split.
        segs: Boundary flags, one per position. A flag equal to ``'1'`` (or
            the integer ``1``) at index ``i`` ends a piece right after
            ``text[i]``.

    Returns:
        A list of consecutive slices of *text*. The remainder after the last
        boundary is always appended, so the list is never empty.
    """
    words = []
    last = 0
    for i, flag in enumerate(segs):
        # Flags normally arrive as characters of a bit string such as "0100",
        # so the comparison must be against the character '1', not only the
        # integer 1.
        if flag in ("1", 1):
            words.append(text[last:i + 1])
            last = i + 1
    words.append(text[last:])
    return words
def evaluate(text, segs):
    """Compute the score of a segmentation (lower is better).

    The score is the number of pieces produced by ``segment(text, segs)``
    plus the length of the distinct pieces joined by single spaces.

    Args:
        text: The text being segmented.
        segs: Boundary flags, passed through to ``segment``.

    Returns:
        The integer score ``text_size + lexicon_size``.
    """
    words = segment(text, segs)
    text_size = len(words)
    # The distinct words are joined with one space each, so every extra
    # lexicon entry costs its length plus one separator character.
    lexicon_size = len(" ".join(set(words)))
    return text_size + lexicon_size
from random import randint
def flip(segs, pos):
    """Return a copy of the bit string *segs* with the bit at *pos* toggled.

    Args:
        segs: A string of ``'0'`` / ``'1'`` characters.
        pos: Index of the bit to toggle. Negative indices count from the
            end, as with normal sequence indexing.

    Returns:
        A new string of the same length with ``'1'`` turned into ``'0'`` or
        ``'0'`` into ``'1'`` at *pos*.

    Raises:
        IndexError: If *pos* is outside ``-len(segs) <= pos < len(segs)``.
    """
    size = len(segs)
    if not -size <= pos < size:
        raise IndexError("flip position out of range")
    # Normalise a negative index first; slicing with a raw negative pos would
    # otherwise splice the wrong halves together.
    pos %= size
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]
def flip_n(segs, n):
    """Toggle *n* randomly chosen positions of *segs* to form a new guess.

    Positions are drawn independently with ``randint``, so the same position
    may be picked more than once.

    Args:
        segs: A string of ``'0'`` / ``'1'`` characters.
        n: Number of random flips to apply.

    Returns:
        The bit string after the flips. An empty *segs* is returned
        unchanged, since there is no position to flip.
    """
    if not segs:
        return segs
    # flip() preserves the length, so the upper bound is loop-invariant.
    last_index = len(segs) - 1
    for _ in range(n):
        segs = flip(segs, randint(0, last_index))
    return segs
def anneal(text, segs, iterations, cooling_rate):
    """Search for a low-scoring segmentation with simulated annealing.

    Starting from *segs*, each round tries *iterations* random perturbations
    of the current segmentation (``flip_n`` with a flip count equal to the
    rounded temperature), keeps the best-scoring one according to
    ``evaluate``, prints its score and words, and then cools down.

    Args:
        text: The text being segmented.
        segs: Initial bit string of boundary flags.
        iterations: Number of random guesses tried per temperature step.
        cooling_rate: Divisor applied to the temperature after each round.
            Must be greater than 1 so the temperature actually decreases.

    Returns:
        The best segmentation bit string found.

    Raises:
        ValueError: If *cooling_rate* is not greater than 1 (the loop would
            otherwise never terminate or divide by zero).
    """
    if cooling_rate <= 1:
        raise ValueError("cooling_rate must be greater than 1")

    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        flips = int(round(temperature))
        for _ in range(iterations):
            guess = flip_n(segs, flips)
            score = evaluate(text, guess)
            # Keep whichever of the guess and the current best scores lower.
            if score < best:
                best, best_segs = score, guess
        segs = best_segs
        temperature = temperature / cooling_rate
        # `best` is already evaluate(text, segs); a single formatted argument
        # keeps the output identical under Python 2 and Python 3.
        print("%s %s" % (best, segment(text, segs)))
    print("")
    return segs
# 示例
>>> text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
>>> seg1 = "0000000000000001000000000010000000000000000100000000000"
>>> anneal(text, seg1, 5000, 1.2)
60 ['doyouseetheki', 'tty', 'see', 'thedoggy', 'doyouliketh', 'ekittylike', 'thedoggy']
58 ['doy', 'ouseetheki', 'ttysee', 'thedoggy', 'doy', 'o', 'ulikethekittylike', 'thedoggy']
56 ['doyou', 'seetheki', 'ttysee', 'thedoggy', 'doyou', 'liketh', 'ekittylike', 'thedoggy']
54 ['doyou', 'seethekit', 'tysee', 'thedoggy', 'doyou', 'likethekittylike', 'thedoggy']
120
53 ['doyou', 'seethekit', 'tysee', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
51 ['doyou', 'seethekittysee', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
'0000100100000001001000000010000100010000000100010000000'
有了足够的数据,就可以以合理的准确度自动将文本分割成词汇。这种方法可用于
为那些词的边界没有任何视觉表示的书写系统分词。
彼此相爱,却不要让爱成了束缚:不如让它成为涌动的大海,两岸乃是你们的灵魂。互斟满杯,却不要同饮一杯。相赠面包,却不要共食一个。一起歌舞欢喜,却依然各自独立,相互交心,却不是让对方收藏。因为唯有生命之手,方能收容你们的心。站在一起却不要过于靠近。—— 纪伯伦《先知》