2014年2月11日火曜日

ピアソン相関で類似性の計算

ピアソン相関によるスコア計算


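ピアソン相関係数の計算式は

$$ r = \frac{\sum xy - \dfrac{\sum x \sum y}{n}}{\sqrt{\left(\sum x^2 - \dfrac{(\sum x)^2}{n}\right)\left(\sum y^2 - \dfrac{(\sum y)^2}{n}\right)}} $$

です($x$, $y$ は2人が共通して評価したアイテムの点数、$n$ はその個数)。値は-1から1の間をとり、完全に相関する場合は1、相関がない場合は0、逆相関の場合は-1となる模様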

Pythonのコード


映画の評価データは前回と同じものを利用していますのでそちらを参照してください
#!/usr/bin/python
# -*- coding: utf-8 -*-

from MovieEval import critics
from math import sqrt

def sim_pearson(prefs, person1, person2):
    """Return the Pearson correlation coefficient between two people's ratings.

    Only items rated by both people take part in the calculation.

    Args:
        prefs: mapping of person -> {item: rating}.
        person1: key of the first person in ``prefs``.
        person2: key of the second person in ``prefs``.

    Returns:
        The correlation coefficient, or 0 when the two people share no
        items or when either person's shared ratings have no variance.
    """
    ratings1 = prefs[person1]
    ratings2 = prefs[person2]

    # Items rated by both people, in the iteration order of person1's ratings.
    shared = [item for item in ratings1 if item in ratings2]
    if not shared:
        return 0

    # Float count so that integer ratings are not truncated by Python 2's "/".
    n = float(len(shared))

    sum1 = sum([ratings1[it] for it in shared])
    sum2 = sum([ratings2[it] for it in shared])

    sum1Sq = sum([pow(ratings1[it], 2) for it in shared])
    sum2Sq = sum([pow(ratings2[it], 2) for it in shared])

    pSum = sum([ratings1[it] * ratings2[it] for it in shared])

    num = pSum - (sum1 * sum2 / n)
    var1 = sum1Sq - pow(sum1, 2) / n
    var2 = sum2Sq - pow(sum2, 2) / n

    # A variance term that is zero -- or slightly negative because of
    # floating-point rounding -- means there is no spread to correlate.
    # Checking before sqrt() avoids a "math domain error" on negative input.
    if var1 <= 0 or var2 <= 0:
        return 0

    den = sqrt(var1 * var2)
    if den == 0:
        return 0
    return num / den

def calcScore(name, prefs):
    """Print the Pearson similarity between ``name`` and every person in ``prefs``.

    Args:
        name: key in ``prefs`` of the person to compare against everyone
            (including themselves).
        prefs: mapping of person -> {item: rating}.
    """
    print("%s:" % name)
    for nm in prefs:
        # Score against the mapping that was passed in, not a module global,
        # so the function works for any preference data.
        print("   %20s    %20f" % (nm, sim_pearson(prefs, name, nm)))
    print("--------------------------------------------------")

if __name__ == '__main__':
    # Print one similarity table per critic in the data set.
    for critic in critics:
        calcScore(critic, critics)
これを実行すると
cuomo@karky7 ~ $ python pearson.py
Jack Matthews:
          Jack Matthews                1.000000
           Mick LaSalle                0.211289
           Claudia Puig                0.028571
              Lisa Rose                0.490990
                   Toby                0.662849
           Gene Seymour                0.963796
       Michael Phillips                0.134840
--------------------------------------------------
Mick LaSalle:
          Jack Matthews                0.211289
           Mick LaSalle                1.000000
           Claudia Puig                0.566947
              Lisa Rose                0.585491
                   Toby                0.924473
           Gene Seymour                0.411765
       Michael Phillips               -0.258199
--------------------------------------------------
Claudia Puig:
          Jack Matthews                0.028571
           Mick LaSalle                0.566947
           Claudia Puig                1.000000
              Lisa Rose                0.883883
                   Toby                0.893405
           Gene Seymour                0.314970
       Michael Phillips                1.000000
--------------------------------------------------
Lisa Rose:
          Jack Matthews                0.490990
           Mick LaSalle                0.585491
           Claudia Puig                0.883883
              Lisa Rose                1.000000
                   Toby                0.991241
           Gene Seymour                0.315264
       Michael Phillips                0.774597
--------------------------------------------------
Toby:
          Jack Matthews                0.662849
           Mick LaSalle                0.924473
           Claudia Puig                0.893405
              Lisa Rose                0.991241
                   Toby                1.000000
           Gene Seymour                0.381246
       Michael Phillips               -1.000000
--------------------------------------------------
Gene Seymour:
          Jack Matthews                0.963796
           Mick LaSalle                0.411765
           Claudia Puig                0.314970
              Lisa Rose                0.315264
                   Toby                0.381246
           Gene Seymour                1.000000
       Michael Phillips                0.204598
--------------------------------------------------
Michael Phillips:
          Jack Matthews                0.134840
           Mick LaSalle               -0.258199
           Claudia Puig                1.000000
              Lisa Rose                0.774597
                   Toby               -1.000000
           Gene Seymour                0.204598
       Michael Phillips                1.000000
--------------------------------------------------
cuomo@karky7 ~ $


Haskellでやると

評価データは前回のものを利用しています

import qualified Data.Map as M
import MovieEval
import Control.Monad (forM_)
import Text.Printf

-- | Similarity score between two critics, looked up by name.
--
-- Despite its name, this delegates to 'sim_pearson'. A critic that is
-- missing from the map is passed on as 'Nothing'.
--
-- The first critic's lookup sits in the @where@ clause so it can be
-- reused when the function is partially applied to its first two arguments.
sim_distance :: M.Map String [Eval] -> String -> String -> Double
sim_distance prefs person1 = scoreAgainst
  where
    ratings1 = M.lookup person1 prefs
    scoreAgainst person2 = sim_pearson ratings1 (M.lookup person2 prefs)

-- | Pearson correlation coefficient between two critics' ratings.
--
-- Only movies that appear in both lists (matched with 'movName')
-- contribute to the score.
--
-- Returns 0 when either critic is 'Nothing', when the critics share no
-- movie, or when the denominator is not a positive number (for example
-- when one critic gave every shared movie the same score).
sim_pearson :: Maybe [Eval] -> Maybe [Eval] -> Double
sim_pearson (Just p1) (Just p2)
  -- No shared movies: every sum is 0 and n is 0, so the formula would
  -- evaluate 0/0 and leak NaN to the caller.
  | null ev = 0
  -- Also rejects variance terms that rounded to a negative value, which
  -- would otherwise turn into NaN inside sqrt.
  | varX > 0 && varY > 0 && den > 0 = num / den
  | otherwise = 0
  where
    -- Score pairs for every movie rated by both critics.
    ev = [ (movEv x, movEv y)
         | x <- p1
         , y <- p2
         , movName x == movName y
         ]
    n = fromIntegral (length ev) :: Double
    xs = map fst ev
    ys = map snd ev
    sumx = sum xs
    sumy = sum ys
    sumxSq = sum [x * x | x <- xs]
    sumySq = sum [y * y | y <- ys]
    pSum = sum [x * y | (x, y) <- ev]
    num = pSum - (sumx * sumy / n)
    varX = sumxSq - (sumx * sumx) / n
    varY = sumySq - (sumy * sumy) / n
    den = sqrt (varX * varY)
sim_pearson _ _ = 0.0

-- | Print a table of scores between @name@ and each entry of @names@.
--
-- The output is a header line with @name@, one right-aligned row per
-- entry of @names@ showing @f name entry@, and a separator line followed
-- by a blank line.
calcScore :: String -> [String] -> (String -> String -> Double) -> IO()
calcScore name names f = do
  putStrLn (name ++ ":")
  mapM_ printRow names
  putStrLn "--------------------------------------------------\n"
  where
    printRow :: String -> IO ()
    printRow other = printf "   %20s    %20f\n" other (f name other)

-- | Print, for every critic in the data set, their score against every
-- critic (themselves included).
main :: IO()
main = forM_ names $ \name -> calcScore name names similarity
  where
    prefs = getCritics
    names = M.keys prefs
    similarity = sim_distance prefs
cuomo@karky7 ~ $ runhaskell peason.hs
Claudia Puig:
           Claudia Puig                     1.0
           Gene Seymour     0.31497039417435607
          Jack Matthews     0.02857142857142857
              Lisa Rose       0.883883476483186
       Michael Phillips                     1.0
           Mick LaSalle      0.5669467095138411
                   Toby      0.8934051474415647
--------------------------------------------------

Gene Seymour:
           Claudia Puig     0.31497039417435607
           Gene Seymour                     1.0
          Jack Matthews       0.963795681875635
              Lisa Rose     0.31526414437773115
       Michael Phillips     0.20459830184114206
           Mick LaSalle     0.41176470588235276
                   Toby     0.38124642583151164
--------------------------------------------------

Jack Matthews:
           Claudia Puig     0.02857142857142857
           Gene Seymour       0.963795681875635
          Jack Matthews                     1.0
              Lisa Rose     0.49099025303098176
       Michael Phillips     0.13483997249264842
           Mick LaSalle     0.21128856368212925
                   Toby        0.66284898035987
--------------------------------------------------

Lisa Rose:
           Claudia Puig       0.883883476483186
           Gene Seymour     0.31526414437773115
          Jack Matthews     0.49099025303098176
              Lisa Rose                     1.0
       Michael Phillips      0.7745966692414834
           Mick LaSalle      0.5854905538443589
                   Toby      0.9912407071619299
--------------------------------------------------

Michael Phillips:
           Claudia Puig                     1.0
           Gene Seymour     0.20459830184114206
          Jack Matthews     0.13483997249264842
              Lisa Rose      0.7745966692414834
       Michael Phillips                     1.0
           Mick LaSalle     -0.2581988897471611
                   Toby                    -1.0
--------------------------------------------------

Mick LaSalle:
           Claudia Puig      0.5669467095138411
           Gene Seymour     0.41176470588235276
          Jack Matthews     0.21128856368212925
              Lisa Rose      0.5854905538443589
       Michael Phillips     -0.2581988897471611
           Mick LaSalle                     1.0
                   Toby      0.9244734516419049
--------------------------------------------------

Toby:
           Claudia Puig      0.8934051474415647
           Gene Seymour     0.38124642583151164
          Jack Matthews        0.66284898035987
              Lisa Rose      0.9912407071619299
       Michael Phillips                    -1.0
           Mick LaSalle      0.9244734516419049
                   Toby                     1.0
--------------------------------------------------
cuomo@karky7 ~ $
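こんな感じで2人の間の類似性が簡単な計算で求められたりするとなかなかおもしろいですね。チョット気になるのが、同一人物でないのに相関値が1.0(や-1.0)で出ているところで、計算の誤差なんでしょうかね?....pythonでやってもhaskellでやっても1.0なのでいいとは思うのですが。(補足: ピアソン相関は、共通して評価した映画の点数が一直線上に並ぶとちょうど±1になります。特に共通の映画が2本しかない場合は、どちらかの点数が同じでない限り必ず±1になるので、計算誤差ではなく共通データの少なさや点数の並び方によるものと思われます。)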

後で得意の電卓でやってみます 笑...


0 件のコメント:

コメントを投稿