import pandas as pd
import math
1.数据预处理
# Two toy documents used throughout the TF-IDF walkthrough.
docA = "The cat sat on my face"
docB = "The dog sat on my bed"

# Tokenise on single spaces (case and word order are preserved).
wordsA = docA.split(" ")
wordsB = docB.split(" ")

# Shared vocabulary: every distinct token that occurs in either document.
wordsSet = set(wordsA) | set(wordsB)
print(wordsSet)
{'on', 'my', 'face', 'sat', 'dog', 'The', 'cat', 'bed'}
2.计算词的频数
# Count how often each vocabulary word occurs in each document.
# Every vocabulary word starts at 0, so both dictionaries share the same
# keys, including words that a given document never uses.
wordCountA = dict.fromkeys(wordsSet, 0)
wordCountB = dict.fromkeys(wordsSet, 0)
for word in wordsA:
    wordCountA[word] += 1
for word in wordsB:
    wordCountB[word] += 1

# One row per document, one column per vocabulary word.
pd.DataFrame([wordCountA, wordCountB])
   on  my  face  sat  dog  The  cat  bed
0   1   1     1    1    0    1    1    0
1   1   1     0    1    1    1    0    1
3.计算词的频率
def computeTF(wordCount, docWords):
    """Return the term frequency of every word in ``wordCount``.

    Args:
        wordCount: Mapping from word to the number of times it occurs in
            the document.
        docWords: The document's tokens; only the length is used, as the
            normalising total.

    Returns:
        A new dict mapping each word of ``wordCount`` to
        ``count / len(docWords)``. When ``docWords`` is empty every word
        maps to ``0.0``.
    """
    docCount = len(docWords)
    if docCount == 0:
        # An empty document has no terms; avoid dividing by zero.
        return dict.fromkeys(wordCount, 0.0)
    return {word: count / docCount for word, count in wordCount.items()}
# Term frequencies of each document over the shared vocabulary.
tfA = computeTF(wordCount=wordCountA, docWords=wordsA)
tfB = computeTF(wordCount=wordCountB, docWords=wordsB)
print("tfA ", tfA)
tfA {'on': 0.16666666666666666, 'my': 0.16666666666666666, 'face': 0.16666666666666666, 'sat': 0.16666666666666666, 'dog': 0.0, 'The': 0.16666666666666666, 'cat': 0.16666666666666666, 'bed': 0.0}
4.计算逆文档频率
def computeIDF(docList):
    """Return the smoothed inverse document frequency of every word.

    Args:
        docList: A list of per-document word-count dicts (word -> count).
            The documents do not have to share the same keys.

    Returns:
        A dict mapping every word seen in any document to
        ``log10((N + 1) / (df + 1))``, where ``N`` is ``len(docList)`` and
        ``df`` is the number of documents in which the word's count is
        greater than zero. An empty ``docList`` yields an empty dict.
    """
    doc_len = len(docList)

    # Document frequency: in how many documents does each word occur?
    docFreq = {}
    for doc in docList:
        for word, count in doc.items():
            # Register the word even when its count is 0 so it still
            # receives an IDF value.
            docFreq.setdefault(word, 0)
            if count > 0:
                docFreq[word] += 1

    # The +1 in numerator and denominator keeps the ratio finite for
    # words that occur in no document.
    return {
        word: math.log10((doc_len + 1) / (freq + 1))
        for word, freq in docFreq.items()
    }
# Inverse document frequency over the two-document corpus.
idf = computeIDF(docList=[wordCountA, wordCountB])
print(idf)
{'on': 0.0, 'my': 0.0, 'face': 0.17609125905568124, 'sat': 0.0, 'dog': 0.17609125905568124, 'The': 0.0, 'cat': 0.17609125905568124, 'bed': 0.17609125905568124}
5.计算 TF-IDF
def computeTFIDF(tf, idf):
    """Return the TF-IDF weight of every word in ``tf``.

    Args:
        tf: Mapping from word to its term frequency in one document.
        idf: Mapping from word to its inverse document frequency.

    Returns:
        A new dict mapping each word of ``tf`` to ``tf[word] * idf[word]``.

    Raises:
        KeyError: If a word of ``tf`` has no entry in ``idf``.
    """
    # The loop variable is deliberately not called `tf`, so it does not
    # shadow the parameter being iterated.
    return {word: freq * idf[word] for word, freq in tf.items()}
# TF-IDF weights per document, shown as one row per document.
tfidfA = computeTFIDF(tf=tfA, idf=idf)
tfidfB = computeTFIDF(tf=tfB, idf=idf)
pd.DataFrame([tfidfA, tfidfB])
    on   my      face  sat       dog  The       cat       bed
0  0.0  0.0  0.029349  0.0  0.000000  0.0  0.029349  0.000000
1  0.0  0.0  0.000000  0.0  0.029349  0.0  0.000000  0.029349