LinkedIn.purs 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. module LinkedIn where
  2. import Prelude
  3. import Yoga.Tree
  4. import Control.Comonad.Cofree (head, tail)
  5. import Data.Array as A
  6. import Data.List (List(..), (:))
  7. import Data.List as L
  8. import Data.List.NonEmpty as NEL
  9. import Data.List.Types (NonEmptyList)
  10. import Data.Maybe (Maybe(..), fromJust)
  11. import Data.String (Pattern(..), joinWith)
  12. import Data.String as S
  13. import Data.String.CodeUnits (fromCharArray, toCharArray)
  14. import Data.Traversable (sequence, traverse)
  15. import Effect (Effect)
  16. import Partial.Unsafe (unsafePartial)
  17. import Web.DOM (Document, Node)
  18. import Web.DOM.Document as D
  19. import Web.DOM.Element as E
  20. import Web.DOM.Node (nodeName, nodeType, textContent)
  21. import Web.DOM.Node as N
  22. import Web.DOM.NodeList as NL
  23. import Web.DOM.NodeType (NodeType(..))
  24. import Web.DOM.ParentNode (QuerySelector(..), querySelector, querySelectorAll)
  25. -- A light abstraction layer above the DOM manipulation API
  26. fromDocument ∷ Document → Node
  27. fromDocument doc = D.toNode doc
  28. queryOne :: String -> Document -> Effect (Maybe Node)
  29. queryOne selector doc = do
  30. found <- querySelector (QuerySelector selector) $ D.toParentNode doc
  31. pure case found of
  32. Nothing -> Nothing
  33. Just el -> Just $ E.toNode el
  34. queryAll :: String -> Document -> Effect (Maybe (NonEmptyList Node))
  35. queryAll selector doc = do
  36. found <- querySelectorAll (QuerySelector selector) $ D.toParentNode doc
  37. liftA1 NEL.fromFoldable $ NL.toArray found
  38. -- First pass of naming ; from here we know what we are looking for
  39. data LinkedInUIElementType = LinkedInUIArtDecoCard | LinkedInUIArtDecoTab | LinkedInUIJobsUnifiedTopCard
  40. instance Show LinkedInUIElementType where
  41. show LinkedInUIArtDecoCard = "ArtDecoCard"
  42. show LinkedInUIArtDecoTab = "ArtDecoTab"
  43. show LinkedInUIJobsUnifiedTopCard = "JobsUnifiedTopCard"
  44. data LinkedInUIElement = LinkedInUIElement LinkedInUIElementType Node
  45. instance Show LinkedInUIElement where
  46. show (LinkedInUIElement typ n) = "LinkedInUIElement(" <> show typ <> ", " <> nodeName n <> ")"
  47. getArtDecoCards ∷ Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  48. getArtDecoCards = queryAll' LinkedInUIArtDecoCard "section.artdeco-card > div ~ div > div > div > ul > li"
  49. getArtDecoTabs ∷ Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  50. getArtDecoTabs = queryAll' LinkedInUIArtDecoTab "div.artdeco-tabs > div > div > div > div > ul > li"
  51. getJobsUnifiedTopCard ∷ Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  52. getJobsUnifiedTopCard = queryAll' LinkedInUIJobsUnifiedTopCard "div.jobs-unified-top-card"
  53. queryAll' ∷ LinkedInUIElementType → String → Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  54. queryAll' constructor selector doc = do
  55. nodes <- queryAll selector doc
  56. pure case nodes of
  57. Nothing -> Nothing
  58. Just cards -> Just $ map (LinkedInUIElement constructor) cards
  59. data DetachedNode =
  60. DetachedElement {tag :: String, content :: String, id :: Maybe String, classes :: List String}
  61. | DetachedComment String
  62. | DetachedText String
  63. instance Show DetachedNode where
  64. show (DetachedElement n) = "DetachedElement(" <> n.tag <> id' <> classes' <> ")"
  65. where
  66. id' = case n.id of
  67. Nothing -> ""
  68. Just i -> "#" <> i
  69. classes' = joinWith " " $ A.fromFoldable (map (\c -> "." <> c) n.classes)
  70. show (DetachedComment c) = "DetachedComment(" <> c <> ")"
  71. show (DetachedText t) = "DetachedText(" <> t <> ")"
  72. asTree :: LinkedInUIElement -> Effect (Tree DetachedNode)
  73. asTree (LinkedInUIElement _ n) = asTree' n
  74. asTree' :: Node -> Effect (Tree DetachedNode)
  75. asTree' n = do
  76. children <- N.childNodes n
  77. childArray <- NL.toArray children :: Effect (Array Node)
  78. detached <- toDetached n
  79. case childArray of
  80. [] -> pure $ leaf detached
  81. arr -> do
  82. a <- sequence (map asTree' arr) :: Effect (Array (Tree DetachedNode))
  83. pure $ mkTree detached a
  84. toDetached :: Node -> Effect DetachedNode
  85. toDetached node = unsafePartial $ toDetached' (nodeType node) node where
  86. toDetached' :: Partial => NodeType -> Node -> Effect DetachedNode
  87. toDetached' ElementNode n = do
  88. txt <- textContent n
  89. let
  90. el = unsafePartial $ fromJust $ E.fromNode n
  91. tag = E.tagName el
  92. id <- E.id el
  93. -- On SVG elements "className" returns a weird "SVGString" type that cannot be trimmed
  94. classStr <- if tag /= "svg" && tag /= "use" && tag /= "path" then E.className el else pure $ ""
  95. pure $ DetachedElement {
  96. tag: E.tagName el,
  97. content: normalize txt,
  98. id: if S.null id then Nothing else Just id,
  99. classes: A.toUnfoldable $ S.split (Pattern " ") (normalize classStr)
  100. }
  101. toDetached' CommentNode n = do
  102. txt <- textContent n
  103. pure $ DetachedComment $ normalize txt
  104. toDetached' TextNode n = do
  105. txt <- textContent n
  106. pure $ DetachedText $ normalize txt
  107. normalize :: String -> String
  108. normalize = normaliseSpace >>> S.trim
  109. normaliseSpace :: String -> String
  110. normaliseSpace s = fromCharArray $ A.fromFoldable $ normaliseSpace' $ A.toUnfoldable $ toCharArray s
  111. where
  112. badSequence ' ' ' ' = true
  113. badSequence ' ' '\n' = true
  114. badSequence '\n' '\n' = true
  115. badSequence '\n' ' ' = true
  116. badSequence _ _ = false
  117. normaliseSpace':: List Char -> List Char
  118. normaliseSpace' Nil = Nil
  119. normaliseSpace' (c1 : xs@(c2 : _)) | badSequence c1 c2 = normaliseSpace' xs
  120. normaliseSpace' (x:xs) = x : normaliseSpace' xs
  121. cutBranches :: forall a. (Tree a -> Boolean) -> Tree a -> Tree a
  122. cutBranches filterIn tree = case head tree, tail tree of
  123. h, [] -> mkTree h []
  124. h, t ->
  125. mkTree h (A.filter filterIn tail) where
  126. tail = map (cutBranches filterIn) t :: Array (Tree a)
  127. filterEmpty ∷ Tree DetachedNode → Boolean
  128. filterEmpty t = case head t of
  129. DetachedComment _ -> false
  130. DetachedText "" -> false
  131. DetachedElement {tag: "SPAN", classes}
  132. | L.any (_ == "white-space-pre") classes -> false
  133. DetachedElement {classes} -> L.all (\c -> c /= "visually-hidden" && c /= "a11y-text") classes
  134. _ -> true
  135. asPrunedTrees :: Maybe (NonEmptyList LinkedInUIElement) → Effect (Maybe (NonEmptyList (Tree DetachedNode)))
  136. asPrunedTrees =
  137. case _ of
  138. Nothing -> pure Nothing
  139. Just els -> do
  140. trees <- traverse asPrunedTree els
  141. pure $ Just $ trees
  142. asPrunedTree :: LinkedInUIElement → Effect (Tree DetachedNode)
  143. asPrunedTree el = do
  144. tree <- asTree el
  145. pure $ cutBranches filterEmpty tree