diff options
author | tv <tv@shackspace.de> | 2014-11-18 23:08:07 +0100 |
---|---|---|
committer | tv <tv@shackspace.de> | 2014-11-18 23:08:07 +0100 |
commit | 8908c3bfaf10277de632c98e70ea9e988f0c7471 (patch) | |
tree | 771f2598739df6faeab6acf07c1815fe78ad4128 /Regfish | |
parent | 64ec0daf8937202092e887c80bc86f2e6e54fe7f (diff) |
Scrape zone with hxt.
Diffstat (limited to 'Regfish')
-rw-r--r-- | Regfish/Parser.hs | 152 | ||||
-rw-r--r-- | Regfish/Types.hs | 11 |
2 files changed, 93 insertions, 70 deletions
diff --git a/Regfish/Parser.hs b/Regfish/Parser.hs index bc294b6..a1af5a5 100644 --- a/Regfish/Parser.hs +++ b/Regfish/Parser.hs @@ -3,96 +3,110 @@ module Regfish.Parser - ( readZone + ( readZoneIO ) where +import Control.Applicative +import qualified Data.List as L import qualified Data.ByteString.Lazy.Char8 as LBS8 import Regfish.Types -import Text.HTML.TagSoup +import Text.HandsomeSoup +import Text.XML.HXT.Core + + +normalizeId :: String -> String +normalizeId = \case + 'a':'_':xs -> xs + 'r':'r':'_':xs -> drop 1 . dropWhile (/='_') $ xs + xs -> xs + + +normalizeRR :: IOSLA (XIOState ()) XmlTree XmlTree +normalizeRR = + processAttrl (changeAttrValue normalizeId `when` hasName "id") + <<< (filterA $ neg $ hasAttr "nowrap") + <<< (removeAttr "style") + + +readZoneIO :: LBS8.ByteString -> IO [RR] +readZoneIO b = + concat <$> mapM (flip readRRs $ LBS8.unpack b) + [ "A" + , "AAAA" + , "ALIAS" + , "CNAME" + , "MX" + , "NS" + , "TXT" + , "SRV" + ] + + +readRRs :: String -> String -> IO [RR] +readRRs rrtype s = do + let --body = LBS8.unpack b -- $ r ^. W.responseBody + doc = parseHtml s + t = processTopDown normalizeRR + <<< removeAllWhiteSpace' + <<< css ("." ++ rrtype) + <<< doc + + -- t also contains <td>IN</td> and <td id="type">TXT</td> + rrid <- runX $ getAttrValue "id" <<< t + rrname <- runX $ css "#name" //> getText <<< t + rrttl <- runX $ css "#ttl" //> getText <<< t + rraux <- runX $ css "#aux" //> getText <<< t + rrdata <- runX $ css "#data" //> getText <<< t + return $ map (parse rrtype) $ + L.transpose [rrid, rrname, rrttl, rraux, rrdata] - -readZone :: LBS8.ByteString -> [Record] -readZone = parseRecords . dnszone - - -parseRecords :: [Tag LBS8.ByteString] -> [Record] -parseRecords = - map toRecord . filter knownRecord . rec [] where - knownRecord (_ : _ : _ : namespace : type_ : _) = - namespace == "IN" && type_ `elem` ["MX", "A", "CNAME"] - knownRecord x = - errorBadArgument "parseRecords.knownRecord" x + -- TODO note that "0"-values should be different - rec ys = \case - (TagOpen "tr" attrs:xs) -> - case getRecordId attrs of - Just id_ -> - let (r,_:xs') = span (/=TagClose "tr") xs - in rec (parseRecord id_ r:ys) xs' - _ -> - let xs' = drop 1 $ dropWhile (/=TagClose "tr") xs - in rec ys xs' + parse "A" [rrid, rrname, rrttl, rrdata] = + RR rrid rrname (read rrttl) (A rrdata) - [] -> ys + parse "AAAA" [rrid, rrname, rrttl, rrdata] = + RR rrid rrname (read rrttl) (AAAA rrdata) - x -> errorBadArgument "parseRecords.rec" x + parse "ALIAS" [rrid, rrname, rrttl, rrdata] = + RR rrid rrname (read rrttl) (ALIAS rrdata) - getRecordId attrs = - case maybe Nothing (Just . LBS8.unpack) $ lookup "id" attrs of - Just ('a':'_':xs) -> Just (LBS8.pack xs) - _ -> Nothing + parse "CNAME" [rrid, rrname, rrttl, rrdata] = + RR rrid rrname (read rrttl) (CNAME rrdata) + parse "MX" [rrid, rrname, rrttl, rraux, rrdata] = + RR rrid rrname (read rrttl) (MX (read rraux) rrdata) + parse "MX" [rrid, rrname, rrttl, rrdata] = + RR rrid rrname (read rrttl) (MX 0 rrdata) -toRecord :: [LBS8.ByteString] -> Record -toRecord xs = case map LBS8.unpack xs of - [rrid, rrname, rrttl, "IN", rrtype, rraux, rrdata] -> - Record rrid rrtype rrname (read rrttl) (read rraux) rrdata - [rrid, rrname, rrttl, "IN", rrtype, rrdata] -> - Record rrid rrtype rrname (read rrttl) 0 rrdata - x -> - errorBadArgument "toRecord" x + parse "NS" [rrid, rrname, rrttl, rrdata] = + RR rrid rrname (read rrttl) (NS rrdata) + parse "NS" [rrid, rrname, rrdata] = + RR rrid rrname 0 (NS rrdata) -dnszone :: LBS8.ByteString -> [Tag LBS8.ByteString] -dnszone = - filter p . (dnszoneSoup . parseTags) - where - p = \case - TagOpen "tr" _ -> True - TagClose "tr" -> True - TagText x - | x /= "\n" -> True - | otherwise -> False - _ -> False - - -parseRecord :: LBS8.ByteString -> [Tag LBS8.ByteString] -> [LBS8.ByteString] -parseRecord id_ = - (id_:) . add_def_ttl "0" . filter p . map f - where - f (TagText x) = x - f x = errorBadArgument "parseRecord" x - p x - | x == "\160" = False - | x == " " = False - | otherwise = True + parse "TXT" [rrid, rrname, rrttl, rrdata] = + RR rrid rrname (read rrttl) (TXT rrdata) + + parse "SRV" [rrid, rrname, rrttl, rrdata] = + RR rrid rrname (read rrttl) (SRV rrdata) - add_def_ttl def_ttl (x:"IN":xs) = x : def_ttl : "IN" : xs - add_def_ttl _ xs = xs + parse x y = errorBadArgument "readRRs.parse" (x, y) -dnszoneSoup :: [Tag LBS8.ByteString] -> [Tag LBS8.ByteString] -dnszoneSoup = - takeWhile (\t -> not $ t ~== dnszoneClose) . - dropWhile (\t -> not $ t ~== dnszoneOpen) + +removeAllWhiteSpace' :: ArrowXml a => a XmlTree XmlTree +removeAllWhiteSpace' = + fromLA $ editNTreeA [isWhiteSpace' :-> Text.XML.HXT.Core.none] + -- fromLA $ processBottomUp removeWhiteSpace' -- less efficient + where + isWhiteSpace' = hasText (all isXmlSpaceChar') + isXmlSpaceChar' = (`elem` " \n\t\r\160") -dnszoneOpen, dnszoneClose :: Tag LBS8.ByteString -dnszoneOpen = TagOpen "table" [("id", "dnszone")] -dnszoneClose = TagClose "table" errorBadArgument :: Show a => String -> a -> b diff --git a/Regfish/Types.hs b/Regfish/Types.hs index cef2808..c453947 100644 --- a/Regfish/Types.hs +++ b/Regfish/Types.hs @@ -65,10 +65,19 @@ data RFState = RFState makeLenses ''RFState +data RR = RR String String Integer RRData + deriving (Show) + + data RRData = A String - | MX Integer String + | AAAA String + | ALIAS String | CNAME String + | MX Integer String + | NS String + | TXT String + | SRV String deriving (Show) |