|
Haskell Snippet, start of bayes probability library
Libraries used: import System.Environment import qualified Data.Map as Map import qualified Data.Set as Set import Data.List import Text.Regex (splitRegex, mkRegex) Type definitions: type WordCat = (String, String) type WordCatInfo = (WordCat, Int) type WordInfo = (String, Int) Utilities for finding the word frequency in a document: -- -- | Find word frequency given an input list using "Data.Map" utilities. -- With (Map.empty :: Map.Map String Int), set k = String and a = Int -- Map.empty :: Map k a -- foldl' is a strict version of foldl = foldl': (a -> b -> a) -> a -> [b] -> a -- Also see: updmap nm key = Map.insertWith (+) key 1 nm -- (Original code from John Goerzen's wordFreq) wordFreq :: [String] -> [WordInfo] wordFreq inlst = Map.toList $ foldl' updateMap (Map.empty :: Map.Map String Int) inlst where updateMap freqmap word = case (Map.lookup word freqmap) of Nothing -> (Map.insert word 1 freqmap) Just x -> (Map.insert word $! x + 1) freqmap -- -- | Word Category Frequency, modified version of wordFreq to -- handle Word Category type. wordCatFreq :: [WordCat] -> [WordCatInfo] wordCatFreq inlst = Map.toList $ foldl' updateMap (Map.empty :: Map.Map WordCat Int) inlst where updateMap freqmap wordcat = case (Map.lookup wordcat freqmap) of Nothing -> (Map.insert wordcat 1 freqmap) Just x -> (Map.insert wordcat $! x + 1) freqmap -- | Pretty print the word/count tuple and output a string. formatWordFreq :: WordInfo -> String formatWordFreq tupl = fst tupl ++ " " ++ (show $ snd tupl) formatWordCat :: WordCatInfo -> String formatWordCat tupl = frmtcat (fst tupl) ++ " " ++ (show $ snd tupl) where frmtcat infotupl = (fst infotupl) ++ ", " ++ (snd infotupl) Utilities for calculating the fisher probability: wordFreqSort :: [String] -> [(String, Int)] wordFreqSort inlst = sortBy freqSort . wordFreq $ inlst -- -- | bayes classification train trainClassify :: String -> String -> [WordCatInfo] trainClassify content cat = let tokens = splitRegex (mkRegex "\\s*[ \t\n]+\\s*") content wordcats = [(tok, cat) | tok <- tokens] in wordCatFreq wordcats -- -- | Return only the tokens in a category. tokensCat :: [WordCatInfo] -> String -> [WordCatInfo] tokensCat tokens cat = let getTokCat row = snd (fst row) tokbycat = filter (\x -> ((getTokCat x) == cat)) tokens in tokbycat tokensByFeature :: [WordCatInfo] -> String -> String -> [WordCatInfo] tokensByFeature tokens tok cat = filter (\x -> ((fst x) == (tok, cat))) tokens -- -- | Count of number of features in a particular category -- Extract the first tuple to get the WordCat type and then the -- second tuple to get the category. catCount :: [WordCatInfo] -> String -> Integer catCount tokens cat = genericLength $ tokensCat tokens cat -- Find the distinct categories categories :: [WordCatInfo] -> [String] categories tokens = let getTokCat row = snd (fst row) allcats = Set.toList . Set.fromList $ [ getTokCat x | x <- tokens ] in allcats featureCount :: [WordCatInfo] -> String -> String -> Integer featureCount tokens tok cat = genericLength $ tokensByFeature tokens tok cat -- -- | Feature probality, count in this category over total in category featureProb :: [WordCatInfo] -> String -> String -> Double featureProb features tok cat = let fct = featureCount features tok cat catct = catCount features cat in (fromIntegral fct) / (fromIntegral catct) -- -- | Calcuate the category probability categoryProb :: [WordCatInfo] -> String -> String -> Double categoryProb features tok cat = initfprob / freqsum where initfprob = featureProb features tok cat freqsum = sum [ (featureProb features tok x) | x <- categories features ] weightedProb :: [WordCatInfo] -> String -> String -> Double -> Double weightedProb features tok cat weight = ((weight*ap)+(totals*initprob))/(weight+totals) where initprob = categoryProb features tok cat ap = 0.5 totals = fromIntegral $ sum [ (featureCount features tok x) | x <- categories features ] -- Inverted Chi2 formula invChi2 :: Double -> Double -> Double invChi2 chi df = minimum([snd newsum, 1.0]) where m = chi / 2.0 initsum = exp (-m) trm = exp (-m) maxrg = fromIntegral (floor (df / 2.0)) :: Double -- Return a tuple with current sum and term, given these inputs newsum = foldl (\(trm,sm) elm -> ((trm*(m/elm)), sm+trm)) (trm,initsum) [1..maxrg] fisherProb :: [WordCatInfo] -> [String] -> String -> Double fisherProb features tokens cat = invchi where initw = 1.0 p = foldl (\prb f -> (prb * (weightedProb features f cat initw))) 1.0 tokens fscore = (-2) * (log p) invchi = invChi2 fscore ((genericLength features) * 2) Some example test cases: simpleTest1 :: IO () simpleTest1 = do content <- readFile badfile let tokens = splitRegex (mkRegex "\\s*[ \t\n]+\\s*") content wordfreq = wordFreqSort tokens mapM_ (\x -> (putStrLn $ formatWordFreq x)) wordfreq putStrLn $ "Number of tokens found: " ++ (show . length $ wordfreq) simpleTest2 :: IO () simpleTest2 = do let badfreq = trainClassify "viagra is bad cialis is good" "bad" goodfreq = trainClassify "I like to run with foxes they cool" "good" allfreq = badfreq ++ goodfreq mapM_ (\x -> (putStrLn $ formatWordCat x)) allfreq simpleTest3 :: IO () simpleTest3 = do let aa = [(("1", "aa") :: (String, String), -1), (("2", "aa"), -1), (("3", "bb"), -1)] tokensAA = tokensCat aa "aa" countAA = catCount aa "aa" c = featureProb aa "1" "aa" putStrLn $ "-->" ++ (show countAA) ++ " // " ++ (show tokensAA) ++ " // " ++ (show c) simpleTest4 :: IO () simpleTest4 = do let aa = [(("dogs dogs", "good") :: (String, String), 3), (("viagra", "bad") :: (String, String), 5), (("fox", "good") :: (String, String), 2), (("dogs", "good"), 4), (("3", "bad"), 5)] bb = categories aa tokensAA = tokensByFeature aa "dogs" "good" c = featureProb aa "dogs" "good" d = catCount aa "good" x = categoryProb aa "xdogs" "good" z = weightedProb aa "dogs" "good" 1.0 putStrLn $ "-->" ++ (show d) ++ "//" ++ (show bb) ++ "//" ++ (show z) simpleTest5 :: IO () simpleTest5 = do let aa = [(("dogs dogs", "good") :: (String, String), 3), (("viagra", "bad") :: (String, String), 5), (("fox", "good") :: (String, String), 2), (("dogs", "good"), 4), (("3", "bad"), 5)] testdata = [ "xdog" ] bb = fisherProb aa testdata "bad" putStrLn $ "-->" ++ show bb Yahoo: word frequency counter Haskell Snippet, start of bayes probability library frequency counter word count Google: frequency counter word count Haskell Snippet, start of bayes probability library word frequency counter |
Related
Frequency Meter Latest
›› Decadal Occurrences Of Statewide Maximum
›› Which language has the most words?
›› MGWCC #011 -- Friday, August 15th, 2008
›› Research Tip: IICLE publications and Sma
›› The longest hyphenated word I've ever se
›› Javascript : Creating Textbox watermarks
›› The Letters We take For Granted
›› unexpected file size increase in Microso
›› "Web 2.0" is one millionth word!
›› English gets millionth word *not*
›› Counting the President's Words Tonight
›› Haskell Snippet, start of bayes probabil
›› Which language has the most words?
›› MGWCC #011 -- Friday, August 15th, 2008
›› Research Tip: IICLE publications and Sma
›› The longest hyphenated word I've ever se
›› Javascript : Creating Textbox watermarks
›› The Letters We take For Granted
›› unexpected file size increase in Microso
›› "Web 2.0" is one millionth word!
›› English gets millionth word *not*
›› Counting the President's Words Tonight
›› Haskell Snippet, start of bayes probabil
Frequency Meter Popular
›› Adding words to gain back lost territory
›› Javascript : Creating Textbox watermarks
›› Which language has the most words?
›› MGWCC #011 -- Friday, August 15th, 2008
›› unexpected file size increase in Microso
›› Yaege FC-1 portable frequency counter?
›› "Web 2.0" is one millionth word!
›› Counting the President's Words Tonight
›› The longest hyphenated word I've ever se
›› English gets millionth word *not*
›› Top 15K English words
›› New high-frequency word resources
›› Javascript : Creating Textbox watermarks
›› Which language has the most words?
›› MGWCC #011 -- Friday, August 15th, 2008
›› unexpected file size increase in Microso
›› Yaege FC-1 portable frequency counter?
›› "Web 2.0" is one millionth word!
›› Counting the President's Words Tonight
›› The longest hyphenated word I've ever se
›› English gets millionth word *not*
›› Top 15K English words
›› New high-frequency word resources
You may find: