page ranked

isovector committed Jul 19, 2022
1 parent 9945616 commit 65cefcc

Showing 10 changed files with 179 additions and 2 deletions.
6 changes: 6 additions & 0 deletions go-go.cabal
@@ -25,6 +25,7 @@ source-repository head

library
exposed-modules:
Data.Graph.PageRank
DB
Index
Keywords
@@ -34,6 +35,7 @@ library
Spider
Types
Utils
WebGraph
other-modules:
Paths_go_go
hs-source-dirs:
@@ -43,6 +45,7 @@ library
, base >=4.7 && <5
, bytestring
, containers
, fgl
, hasql
, http-client
, http-client-tls
@@ -74,6 +77,7 @@ executable go-go-exe
, base >=4.7 && <5
, bytestring
, containers
, fgl
, go-go
, hasql
, http-client
@@ -105,6 +109,7 @@ executable spider
, base >=4.7 && <5
, bytestring
, containers
, fgl
, go-go
, hasql
, http-client
@@ -137,6 +142,7 @@ test-suite go-go-test
, base >=4.7 && <5
, bytestring
, containers
, fgl
, go-go
, hasql
, http-client
1 change: 1 addition & 0 deletions package.yaml
@@ -40,6 +40,7 @@ dependencies:
- streaming-utils
- warc
- HTTP
- fgl

library:
source-dirs: src
5 changes: 5 additions & 0 deletions src/DB.hs
@@ -51,6 +51,7 @@ data DiscoveryState
| Explored
| Pruned
| Errored
| Unacceptable
deriving stock (Eq, Ord, Show, Read, Enum, Bounded, Generic)
deriving (DBType, DBEq) via ReadShow DiscoveryState

@@ -60,6 +61,7 @@ data Discovery f = Discovery
, d_state :: Column f DiscoveryState
, d_depth :: Column f Int32
, d_data :: Column f ByteString
, d_rank :: Column f Double
}
deriving stock Generic
deriving anyclass Rel8able
@@ -111,6 +113,7 @@ CREATE TABLE IF NOT EXISTS discovery (
uri TEXT UNIQUE NOT NULL,
state VARCHAR(10) NOT NULL,
depth int4 NOT NULL,
rank float8 NOT NULL,
data bytea NOT NULL
);
@@ -126,6 +129,7 @@ discoverySchema = TableSchema
, d_state = "state"
, d_depth = "depth"
, d_data = "data"
, d_rank = "rank"
}
}

@@ -306,6 +310,7 @@ rootNodes = Insert
, d_state = lit Discovered
, d_depth = 0
, d_data = ""
, d_rank = 0
}
, onConflict = DoNothing
, returning = pure ()
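
A note not part of this commit: since the schema above is created with CREATE TABLE IF NOT EXISTS, a database that already has a discovery table won't pick up the new column, so it has to be added by hand. A minimal migration sketch, assuming the rank-0 default that rootNodes and Spider's discover use:

    ALTER TABLE discovery ADD COLUMN rank float8 NOT NULL DEFAULT 0;

Relatedly, state is declared VARCHAR(10) while the new Unacceptable constructor serializes via ReadShow to a 12-character string, so storing that state would also need the column widened (e.g. ALTER TABLE discovery ALTER COLUMN state TYPE VARCHAR(16);).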
65 changes: 65 additions & 0 deletions src/Data/Graph/PageRank.hs
@@ -0,0 +1,65 @@
-- Sorry for the awful module; I stole it from graph-utils which no longer
-- compiles.
{-# LANGUAGE BangPatterns #-}

module Data.Graph.PageRank (pageRanks) where

import Control.Monad.RWS (RWS, asks, gets, execRWS, get, put)
import Data.Graph.Inductive hiding (size)
import Prelude hiding (lookup)
import Data.Map (Map, lookup, fromList, foldrWithKey, findWithDefault)
import Data.Maybe (fromJust)
import Control.Monad
import Debug.Trace (traceM)

data Env = Env {node :: [Node], size :: Int, from :: Map Node [Node], outdegrees :: Map Node Int}

-- |'PRMachine' is the monad for computing PageRank: an RWS computation
-- reading the graph 'Env' and holding each node's current rank as state.
type PRMachine = RWS Env () (Map Node Double)

lookupEnv :: (Ord a) => (Env -> Map a b) -> a -> PRMachine b
lookupEnv f a = do { dic <- asks f; pure $ fromJust $ lookup a dic }

outdegree :: Node -> PRMachine Int
outdegree = lookupEnv outdegrees

froms :: Node -> PRMachine [Node]
froms = lookupEnv from

currentRank :: Node -> PRMachine Double
currentRank nd = gets (fromJust.lookup nd)

-- |'pageRanks' calculates the PageRank of each node in the graph 'gr'.
pageRanks :: Graph gr => gr a b -> Double -> Double -> Map Node Double
pageRanks gr epsilon tol = fst $ execRWS (steps 0) Env{node=nds, size=count, from=froms, outdegrees=outdegs} initRanks
  where nds = nodes gr
        count :: (Num a) => a
        count = fromIntegral $ noNodes gr
        froms = fromList $ zip nds $ fmap (pre gr) nds
        outdegs = fromList $ zip nds $ fmap (outdeg gr) nds
        initRanks = fromList $ zip nds $ replicate count (1/count)
        -- Give up after 1000 iterations, converged or not.
        steps 1000 = get
        steps n = do
          !_ <- traceM $ "stepping " <> show n
          old <- get
          new <- calcPageRank epsilon
          -- Converged once no node's rank has moved by 'tol' or more.
          let cond = foldrWithKey (\k a b -> b && abs (findWithDefault (1/0) k new - a) < tol) True old
          if cond then pure new else steps (n + 1)



-- One full update pass: PR(n) = epsilon/N + (1 - epsilon) * sum of
-- PR(m)/outdeg(m) over n's predecessors m.
calcPageRank :: Double -> PRMachine (Map Node Double)
calcPageRank epsilon = do
    nds <- asks node
    count <- asks $ fromIntegral . size
    dic <- forM nds $ \n -> do
      frms <- froms n
      ranks <- forM frms $ \m -> do
        deg <- outdegree m
        rank <- currentRank m
        pure (rank/fromIntegral deg)
      pure (n, epsilon/count + (1-epsilon)*(sum ranks))
    let rdic = fromList dic
    put rdic
    pure rdic
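
A minimal usage sketch for the module above (hypothetical, not part of the commit), passing the same 0.1/0.1 epsilon and tolerance as the commented-out call in WebGraph below; node 3, with two inbound links, should come out ranked highest:

    module PageRankDemo where

    import Data.Graph.Inductive (Node, mkGraph)
    import Data.Graph.Inductive.PatriciaTree (Gr)
    import Data.Graph.PageRank (pageRanks)
    import Data.Map (Map)

    -- A tiny three-page web: 1 -> 2 -> 3 -> 1, plus a chord 1 -> 3.
    demo :: Map Node Double
    demo = pageRanks gr 0.1 0.1
      where
        gr :: Gr () ()
        gr = mkGraph [(1, ()), (2, ()), (3, ())]
                     [(1, 2, ()), (2, 3, ()), (3, 1, ()), (1, 3, ())]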

3 changes: 3 additions & 0 deletions src/Ranking.hs
@@ -26,3 +26,6 @@ rank
, round $ (fromIntegral para / fromIntegral heads) * 5
]




10 changes: 10 additions & 0 deletions src/Signals.hs
@@ -119,6 +119,7 @@ isAcceptableLink uri
, ".tiff"
, ".epub"
, ".zip"
, ".tgz"
, ".7g"
, ".tar"
, ".doc"
@@ -143,14 +144,20 @@ isAcceptableLink uri
, "instagram.com"
, "google.com"
, "amazon.com"
, "flickr.com"
, "wp.me"
, "tiktok.com"
, "snapchat.com"
, "spoilertv.com"
, "linkedin.com"
, "tumblr.com"
, "archive.org"
, "vimeo.com"
, "tinyurl.com"
]
, not (isOnDomain (uriRegName auth) "wikipedia.org")
|| (isOnDomain (uriRegName auth) "wikipedia.org"
&& isOnDomain (uriRegName auth) "en.wikipedia.org")
]
| otherwise = False
where
@@ -207,3 +214,6 @@ rankStuff =
hasSticky :: Ranker Bool
hasSticky = fmap (not . null) $ chroots "div" $ withClass "div" $ T.isInfixOf "sticky"

hasModal :: Ranker Bool
hasModal = fmap (not . null) $ chroots "div" $ withClass "div" $ T.isInfixOf "modal"

3 changes: 2 additions & 1 deletion src/Spider.hs
@@ -145,6 +145,7 @@ discover depth uri = Insert
(lit Discovered)
(lit $ depth + 1)
(lit "")
(lit 0)
, onConflict = DoUpdate $ Upsert
{ index = d_docId
, set = \new old -> old { d_depth = leastExpr (d_depth old) (d_depth new) }
@@ -177,7 +178,7 @@ buildEdges conn disc ls = do
getDocs :: [WordId] -> Query (Discovery Expr)
getDocs [] = do
where_ $ true ==. false
pure $ lit $ Discovery (DocId 0) "" Discovered 0 ""
pure $ lit $ Discovery (DocId 0) "" Discovered 0 "" 0
getDocs wids = distinct $ do
d <- distinct $ foldr1 intersect $ do
wid <- wids
79 changes: 79 additions & 0 deletions src/WebGraph.hs
@@ -0,0 +1,79 @@
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeApplications #-}

module WebGraph where

import DB
import Rel8
import Control.Applicative
import Data.Int (Int64)
import Hasql.Statement
import Hasql.Connection (Connection, acquire)
import Hasql.Session
import Data.Graph.PageRank (pageRanks)
import Data.Map (toDescList)
import Streaming (Stream, Of)
import qualified Streaming as S
import qualified Streaming.ByteString.Char8 as SB
import qualified Streaming.Prelude as S
import Data.ByteString.Streaming.HTTP (MonadResource, runResourceT)
import Data.ByteString.Char8 (ByteString)
import Data.Function ((&))
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8)
import Data.Bifunctor (bimap)
import Control.Monad.Trans (lift)

pagerankCsv :: forall m. MonadResource m => Stream (Of (DocId, Double)) m ()
pagerankCsv
    = SB.readFile @m "/home/sandy/pagerank.csv"
    & SB.lines
    & S.mapped SB.toStrict
    & S.drop 1
    & S.map (break (== ',') . T.unpack . decodeUtf8)
    & S.map (bimap (DocId . read) $ read . tail)

main :: IO ()
main = do
  Right conn <- acquire connectionSettings
  runResourceT $ do
    -- Walk the CSV, issuing one UPDATE per row; the counter prints progress.
    flip S.mapM_ (S.zip (S.each [0..]) pagerankCsv) $ \x@(n, (d, pr)) -> lift $ do
      print n
      flip run conn $ statement () $ update $ Update
        { target = discoverySchema
        , from = pure ()
        , set = const $ \e -> e { d_rank = lit pr }
        , updateWhere = const $ \e -> d_docId e ==. lit d
        , returning = pure ()
        }

-- gr <- buildGraph conn
-- print "built graph"
-- let pr = pageRanks gr 0.1 0.1
-- print $ head $ toDescList $ pr

-- buildGraph :: Connection -> IO (Gr () ())
-- buildGraph conn = do
-- Right nodes <- flip run conn $ statement () $ select selNodes
-- print "got nodes"
-- Right edges <- flip run conn $ statement () $ select selEdges
-- print "got edges"
-- let f = fromIntegral . unDocId
-- pure $ mkGraph (fmap (\ di -> (f di, ())) nodes) (fmap (\ (src, dst) -> (f src, f dst, ())) edges)


-- selEdges :: Query (Expr DocId, Expr DocId)
-- selEdges = do
-- Edges _ src dst _ <- each edgesSchema
-- pure $ (src, dst)


-- selNodes :: Query (Expr DocId)
-- selNodes = do
-- d <- each discoverySchema
-- pure $ d_docId d
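
The pagerank.csv consumed above is not in the commit; from the parser, it is assumed to be a header line followed by one docId,rank record per row. break (== ',') leaves the comma attached to the rank field, which read . tail then drops. Hypothetically:

    docId,rank
    1,0.000421
    2,0.000173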


2 changes: 1 addition & 1 deletion stack.yaml
@@ -13,4 +13,4 @@ extra-deps:
- streaming-utils-0.2.1.0@sha256:845e19526be30c0dbfeb62fd6cbbf86889c47c5bd983273d56464444ecfbd7fe,4055
- warc-1.0.5@sha256:d5d24fd625439063998a16026fca0b62b251d07a837115aa969edfcf7031f914,2685
- json-stream-0.4.4.1@sha256:272cd4aed5d0d9c97978620c1435cd91acaa39c8bddb28cbace9284f58ddc364,4985

- graph-utils-0.3.7@sha256:aad4856e5567daa99401db2c78b90c240d9b5c9dd789438c51786d9eeabf10e8,1169
7 changes: 7 additions & 0 deletions stack.yaml.lock
@@ -53,6 +53,13 @@ packages:
sha256: 1eac606b34f75ecdb76c75a9ca90be32ddc56f16b8895c2136c4d368c20cd42e
original:
hackage: json-stream-0.4.4.1@sha256:272cd4aed5d0d9c97978620c1435cd91acaa39c8bddb28cbace9284f58ddc364,4985
- completed:
hackage: graph-utils-0.3.7@sha256:aad4856e5567daa99401db2c78b90c240d9b5c9dd789438c51786d9eeabf10e8,1169
pantry-tree:
size: 433
sha256: a1a3806ca76639eba09f89d1e6f97cbcead5dac34a5e06fd49482f6a8c1f1132
original:
hackage: graph-utils-0.3.7@sha256:aad4856e5567daa99401db2c78b90c240d9b5c9dd789438c51786d9eeabf10e8,1169
snapshots:
- completed:
size: 586286
