LinkedIn.purs 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. module LinkedIn where
  2. import Prelude
  3. import Yoga.Tree
  4. import Control.Comonad.Cofree (head, tail)
  5. import Data.Array as A
  6. import Data.List (List(..), (:))
  7. import Data.List as L
  8. import Data.List.NonEmpty as NEL
  9. import Data.List.Types (NonEmptyList)
  10. import Data.Maybe (Maybe(..), fromJust)
  11. import Data.String (Pattern(..), joinWith)
  12. import Data.String as S
  13. import Data.String.CodeUnits (fromCharArray, toCharArray)
  14. import Data.Traversable (sequence, traverse)
  15. import Data.Tuple (Tuple(..))
  16. import Effect (Effect)
  17. import Partial.Unsafe (unsafePartial)
  18. import Web.DOM (Document, Element, Node)
  19. import Web.DOM.Document (origin)
  20. import Web.DOM.Document as D
  21. import Web.DOM.Element as E
  22. import Web.DOM.Node (nodeName, nodeType, textContent)
  23. import Web.DOM.Node as N
  24. import Web.DOM.NodeList as NL
  25. import Web.DOM.NodeType (NodeType(..))
  26. import Web.DOM.ParentNode (QuerySelector(..), querySelector, querySelectorAll)
  27. import Yoga.Tree.Zipper as Z
  28. -- A light abstraction layer above the DOM manipulation API
  29. fromDocument ∷ Document → Node
  30. fromDocument doc = D.toNode doc
  31. queryOne :: String -> Document -> Effect (Maybe Node)
  32. queryOne selector doc = do
  33. found <- querySelector (QuerySelector selector) $ D.toParentNode doc
  34. pure case found of
  35. Nothing -> Nothing
  36. Just el -> Just $ E.toNode el
  37. queryAll :: String -> Document -> Effect (Maybe (NonEmptyList Node))
  38. queryAll selector doc = do
  39. found <- querySelectorAll (QuerySelector selector) $ D.toParentNode doc
  40. liftA1 NEL.fromFoldable $ NL.toArray found
  41. -- First pass of naming ; from here we know what we are looking for
  42. data LinkedInUIElementType = LinkedInUIArtDecoCard | LinkedInUIArtDecoTab | LinkedInUIJobsUnifiedTopCard
  43. instance Show LinkedInUIElementType where
  44. show LinkedInUIArtDecoCard = "ArtDecoCard"
  45. show LinkedInUIArtDecoTab = "ArtDecoTab"
  46. show LinkedInUIJobsUnifiedTopCard = "JobsUnifiedTopCard"
  47. data LinkedInUIElement = LinkedInUIElement LinkedInUIElementType Node
  48. instance Show LinkedInUIElement where
  49. show (LinkedInUIElement typ n) = "LinkedInUIElement(" <> show typ <> ", " <> nodeName n <> ")"
  50. getArtDecoCards ∷ Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  51. getArtDecoCards = queryAll' LinkedInUIArtDecoCard "section.artdeco-card > div ~ div > div > div > ul > li"
  52. getArtDecoTabs ∷ Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  53. getArtDecoTabs = queryAll' LinkedInUIArtDecoTab "div.artdeco-tabs > div > div > div > div > ul > li"
  54. getJobsUnifiedTopCard ∷ Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  55. getJobsUnifiedTopCard = queryAll' LinkedInUIJobsUnifiedTopCard "div.jobs-unified-top-card"
  56. queryAll' ∷ LinkedInUIElementType → String → Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  57. queryAll' constructor selector doc = do
  58. nodes <- queryAll selector doc
  59. pure case nodes of
  60. Nothing -> Nothing
  61. Just cards -> Just $ map (LinkedInUIElement constructor) cards
  62. data DetachedNode =
  63. DetachedElement {tag :: String, content :: String, id :: Maybe String, classes :: List String}
  64. | DetachedComment String
  65. | DetachedText String
  66. instance Show DetachedNode where
  67. show (DetachedElement n) = "DetachedElement(" <> n.tag <> id' <> classes' <> ")"
  68. where
  69. id' = case n.id of
  70. Nothing -> ""
  71. Just i -> "#" <> i
  72. classes' = joinWith " " $ A.fromFoldable (map (\c -> "." <> c) n.classes)
  73. show (DetachedComment c) = "DetachedComment(" <> c <> ")"
  74. show (DetachedText t) = "DetachedText(" <> t <> ")"
  75. asTree :: LinkedInUIElement -> Effect (Tree DetachedNode)
  76. asTree (LinkedInUIElement _ n) = asTree' n
  77. asTree' :: Node -> Effect (Tree DetachedNode)
  78. asTree' n = do
  79. children <- N.childNodes n
  80. childArray <- NL.toArray children :: Effect (Array Node)
  81. detached <- toDetached n
  82. case childArray of
  83. [] -> pure $ leaf detached
  84. arr -> do
  85. a <- sequence (map asTree' arr) :: Effect (Array (Tree DetachedNode))
  86. pure $ mkTree detached a
  87. toDetached :: Node -> Effect DetachedNode
  88. toDetached node = unsafePartial $ toDetached' (nodeType node) node where
  89. toDetached' :: Partial => NodeType -> Node -> Effect DetachedNode
  90. toDetached' ElementNode n = do
  91. txt <- textContent n
  92. let
  93. el = unsafePartial $ fromJust $ E.fromNode n
  94. tag = E.tagName el
  95. id <- E.id el
  96. -- On SVG elements "className" returns a weird "SVGString" type that cannot be trimmed
  97. classStr <- if tag /= "svg" && tag /= "use" && tag /= "path" then E.className el else pure $ ""
  98. pure $ DetachedElement {
  99. tag: E.tagName el,
  100. content: normalize txt,
  101. id: if S.null id then Nothing else Just id,
  102. classes: A.toUnfoldable $ S.split (Pattern " ") (normalize classStr)
  103. }
  104. toDetached' CommentNode n = do
  105. txt <- textContent n
  106. pure $ DetachedComment $ normalize txt
  107. toDetached' TextNode n = do
  108. txt <- textContent n
  109. pure $ DetachedText $ normalize txt
  110. normalize :: String -> String
  111. normalize = normaliseSpace >>> S.trim
  112. normaliseSpace :: String -> String
  113. normaliseSpace s = fromCharArray $ A.fromFoldable $ normaliseSpace' $ A.toUnfoldable $ toCharArray s
  114. where
  115. badSequence ' ' ' ' = true
  116. badSequence ' ' '\n' = true
  117. badSequence '\n' '\n' = true
  118. badSequence '\n' ' ' = true
  119. badSequence _ _ = false
  120. normaliseSpace':: List Char -> List Char
  121. normaliseSpace' Nil = Nil
  122. normaliseSpace' (c1 : xs@(c2 : _)) | badSequence c1 c2 = normaliseSpace' xs
  123. normaliseSpace' (x:xs) = x : normaliseSpace' xs
  124. cutBranches :: forall a. (Tree a -> Boolean) -> Tree a -> Tree a
  125. cutBranches filterIn tree = case head tree, tail tree of
  126. h, [] -> mkTree h []
  127. h, t ->
  128. mkTree h (A.filter filterIn tail) where
  129. tail = map (cutBranches filterIn) t :: Array (Tree a)
  130. filterEmpty ∷ Tree DetachedNode → Boolean
  131. filterEmpty t = case head t of
  132. DetachedComment _ -> false
  133. DetachedText "" -> false
  134. DetachedElement {tag: "SPAN", classes}
  135. | L.any (_ == "white-space-pre") classes -> false
  136. DetachedElement {classes} -> L.all (\c -> c /= "visually-hidden" && c /= "a11y-text") classes
  137. _ -> true
  138. asPrunedTrees :: Maybe (NonEmptyList LinkedInUIElement) → Effect (Maybe (NonEmptyList (Tree DetachedNode)))
  139. asPrunedTrees =
  140. case _ of
  141. Nothing -> pure Nothing
  142. Just els -> do
  143. trees <- traverse asPrunedTree els
  144. pure $ Just $ trees
  145. asPrunedTree :: LinkedInUIElement → Effect (Tree DetachedNode)
  146. asPrunedTree el = do
  147. tree <- asTree el
  148. pure $ cutBranches filterEmpty tree