LinkedIn.purs 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. module LinkedIn where
  2. import Prelude
  3. import Yoga.Tree
  4. import Control.Comonad.Cofree (head, tail)
  5. import Data.Array as A
  6. import Data.Generic.Rep (class Generic)
  7. import Data.List (List(..), (:))
  8. import Data.List as L
  9. import Data.List.NonEmpty as NEL
  10. import Data.List.Types (NonEmptyList)
  11. import Data.Maybe (Maybe(..), fromJust)
  12. import Data.Show.Generic (genericShow)
  13. import Data.String (Pattern(..), joinWith)
  14. import Data.String as S
  15. import Data.String.CodeUnits (fromCharArray, toCharArray)
  16. import Data.Traversable (sequence, traverse)
  17. import Effect (Effect)
  18. import Partial.Unsafe (unsafePartial)
  19. import Web.DOM (Document, Node)
  20. import Web.DOM.Document as D
  21. import Web.DOM.Element as E
  22. import Web.DOM.Node (nodeName, nodeType, textContent)
  23. import Web.DOM.Node as N
  24. import Web.DOM.NodeList as NL
  25. import Web.DOM.NodeType (NodeType(..))
  26. import Web.DOM.ParentNode (QuerySelector(..), querySelector, querySelectorAll)
  27. -- A light abstraction layer above the DOM manipulation API
  28. fromDocument ∷ Document → Node
  29. fromDocument doc = D.toNode doc
  30. queryOne :: String -> Document -> Effect (Maybe Node)
  31. queryOne selector doc = do
  32. found <- querySelector (QuerySelector selector) $ D.toParentNode doc
  33. pure case found of
  34. Nothing -> Nothing
  35. Just el -> Just $ E.toNode el
  36. queryAll :: String -> Document -> Effect (Maybe (NonEmptyList Node))
  37. queryAll selector doc = do
  38. found <- querySelectorAll (QuerySelector selector) $ D.toParentNode doc
  39. liftA1 NEL.fromFoldable $ NL.toArray found
  40. -- First pass of naming ; from here we know what we are looking for
  41. data LinkedInUIElementType = LinkedInUIArtDecoCard | LinkedInUIArtDecoTab | LinkedInUIJobsUnifiedTopCard
  42. instance Show LinkedInUIElementType where
  43. show LinkedInUIArtDecoCard = "ArtDecoCard"
  44. show LinkedInUIArtDecoTab = "ArtDecoTab"
  45. show LinkedInUIJobsUnifiedTopCard = "JobsUnifiedTopCard"
  46. data LinkedInUIElement = LinkedInUIElement LinkedInUIElementType Node
  47. instance Show LinkedInUIElement where
  48. show (LinkedInUIElement typ n) = "LinkedInUIElement(" <> show typ <> ", " <> nodeName n <> ")"
  49. getArtDecoCards ∷ Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  50. getArtDecoCards = queryAll' LinkedInUIArtDecoCard "section.artdeco-card > div ~ div > div > div > ul > li"
  51. getArtDecoTabs ∷ Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  52. getArtDecoTabs = queryAll' LinkedInUIArtDecoTab "div.artdeco-tabs > div > div > div > div > ul > li"
  53. getJobsUnifiedTopCard ∷ Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  54. getJobsUnifiedTopCard = queryAll' LinkedInUIJobsUnifiedTopCard "div.jobs-unified-top-card"
  55. queryAll' ∷ LinkedInUIElementType → String → Document → Effect (Maybe (NonEmptyList LinkedInUIElement))
  56. queryAll' constructor selector doc = do
  57. nodes <- queryAll selector doc
  58. pure case nodes of
  59. Nothing -> Nothing
  60. Just cards -> Just $ map (LinkedInUIElement constructor) cards
  61. data DetachedNode =
  62. DetachedElement {tag :: String, content :: String, id :: Maybe String, classes :: List String}
  63. | DetachedComment String
  64. | DetachedText String
  65. derive instance Generic DetachedNode _
  66. derive instance Eq DetachedNode
  67. instance Show DetachedNode where
  68. show = genericShow
  69. asTree :: LinkedInUIElement -> Effect (Tree DetachedNode)
  70. asTree (LinkedInUIElement _ n) = asTree' n
  71. asTree' :: Node -> Effect (Tree DetachedNode)
  72. asTree' n = do
  73. children <- N.childNodes n
  74. childArray <- NL.toArray children :: Effect (Array Node)
  75. detached <- toDetached n
  76. case childArray of
  77. [] -> pure $ leaf detached
  78. arr -> do
  79. a <- sequence (map asTree' arr) :: Effect (Array (Tree DetachedNode))
  80. pure $ mkTree detached a
  81. toDetached :: Node -> Effect DetachedNode
  82. toDetached node = unsafePartial $ toDetached' (nodeType node) node where
  83. toDetached' :: Partial => NodeType -> Node -> Effect DetachedNode
  84. toDetached' ElementNode n = do
  85. txt <- textContent n
  86. let
  87. el = unsafePartial $ fromJust $ E.fromNode n
  88. tag = E.tagName el
  89. id <- E.id el
  90. -- On SVG elements "className" returns a weird "SVGString" type that cannot be trimmed
  91. classStr <- if tag /= "svg" && tag /= "use" && tag /= "path" then E.className el else pure $ ""
  92. pure $ DetachedElement {
  93. tag: E.tagName el,
  94. content: normalize txt,
  95. id: if S.null id then Nothing else Just id,
  96. classes: A.toUnfoldable $ S.split (Pattern " ") (normalize classStr)
  97. }
  98. toDetached' CommentNode n = do
  99. txt <- textContent n
  100. pure $ DetachedComment $ normalize txt
  101. toDetached' TextNode n = do
  102. txt <- textContent n
  103. pure $ DetachedText $ normalize txt
  104. normalize :: String -> String
  105. normalize = normaliseSpace >>> S.trim
  106. normaliseSpace :: String -> String
  107. normaliseSpace s = fromCharArray $ A.fromFoldable $ normaliseSpace' $ A.toUnfoldable $ toCharArray s
  108. where
  109. badSequence ' ' ' ' = true
  110. badSequence ' ' '\n' = true
  111. badSequence '\n' '\n' = true
  112. badSequence '\n' ' ' = true
  113. badSequence _ _ = false
  114. normaliseSpace':: List Char -> List Char
  115. normaliseSpace' Nil = Nil
  116. normaliseSpace' (c1 : xs@(c2 : _)) | badSequence c1 c2 = normaliseSpace' xs
  117. normaliseSpace' (x:xs) = x : normaliseSpace' xs
  118. cutBranches :: forall a. (Tree a -> Boolean) -> Tree a -> Tree a
  119. cutBranches filterIn tree = case head tree, tail tree of
  120. h, [] -> mkTree h []
  121. h, t ->
  122. mkTree h (A.filter filterIn tail) where
  123. tail = map (cutBranches filterIn) t :: Array (Tree a)
  124. filterEmpty ∷ Tree DetachedNode → Boolean
  125. filterEmpty t = case head t of
  126. DetachedComment _ -> false
  127. DetachedText "" -> false
  128. DetachedElement {tag: "SPAN", classes}
  129. | L.any (_ == "white-space-pre") classes -> false
  130. DetachedElement {classes} -> L.all (\c -> c /= "visually-hidden" && c /= "a11y-text") classes
  131. _ -> true
  132. asPrunedTrees :: Maybe (NonEmptyList LinkedInUIElement) → Effect (Maybe (NonEmptyList (Tree DetachedNode)))
  133. asPrunedTrees =
  134. case _ of
  135. Nothing -> pure Nothing
  136. Just els -> do
  137. trees <- traverse asPrunedTree els
  138. pure $ Just $ trees
  139. asPrunedTree :: LinkedInUIElement → Effect (Tree DetachedNode)
  140. asPrunedTree el = do
  141. tree <- asTree el
  142. pure $ cutBranches filterEmpty tree