11{-# LANGUAGE RecordWildCards, NamedFieldPuns, RecursiveDo #-}
2+ {-# LANGUAGE TupleSections #-}
23
34module Distribution.Server.Features.Sitemap (
45 SitemapFeature (.. )
@@ -25,7 +26,25 @@ import Data.ByteString.Lazy (ByteString)
2526import Data.Time.Clock (UTCTime (.. ), getCurrentTime )
2627import Data.Time.Calendar (showGregorian )
2728import Network.URI
29+ import Control.DeepSeq
30+ import Text.Read
31+ import Data.List.Split
32+ import Distribution.Server.Framework.BlobStorage
33+ import Distribution.Server.Features.TarIndexCache
34+ import qualified Data.TarIndex as Tar
35+ import System.FilePath (takeExtension )
2836
37+ data Sitemap
38+ = Sitemap
39+ { sitemapIndex :: XMLResponse
40+ , sitemaps :: [XMLResponse ]
41+ }
42+
43+ instance NFData Sitemap where
44+ rnf (Sitemap i s) = rnf i `seq` rnf s
45+
46+ instance MemSize Sitemap where
47+ memSize (Sitemap i s) = memSize2 i s
2948
3049data SitemapFeature = SitemapFeature {
3150 sitemapFeatureInterface :: HackageFeature
@@ -38,6 +57,7 @@ initSitemapFeature :: ServerEnv
3857 -> IO ( CoreFeature
3958 -> DocumentationFeature
4059 -> TagsFeature
60+ -> TarIndexCacheFeature
4161 -> IO SitemapFeature )
4262
4363initSitemapFeature env@ ServerEnv { serverCacheDelay,
@@ -46,10 +66,11 @@ initSitemapFeature env@ServerEnv{ serverCacheDelay,
4666
4767 return $ \ coref@ CoreFeature {.. }
4868 docsCore@ DocumentationFeature {.. }
49- tagsf@ TagsFeature {.. } -> do
69+ tagsf@ TagsFeature {.. }
70+ tarf@ TarIndexCacheFeature {.. } -> do
5071
5172 rec let (feature, updateSitemapCache) =
52- sitemapFeature env coref docsCore tagsf
73+ sitemapFeature env coref docsCore tagsf tarf
5374 initTime sitemapCache
5475
5576 sitemapCache <- newAsyncCacheNF updateSitemapCache
@@ -66,63 +87,85 @@ sitemapFeature :: ServerEnv
6687 -> CoreFeature
6788 -> DocumentationFeature
6889 -> TagsFeature
90+ -> TarIndexCacheFeature
6991 -> UTCTime
70- -> AsyncCache XMLResponse
71- -> (SitemapFeature , IO XMLResponse )
92+ -> AsyncCache Sitemap
93+ -> (SitemapFeature , IO Sitemap )
7294sitemapFeature ServerEnv {.. }
7395 CoreFeature {.. }
7496 DocumentationFeature {.. }
7597 TagsFeature {.. }
98+ TarIndexCacheFeature {cachedTarIndex}
7699 initTime
77100 sitemapCache
78101 = (SitemapFeature {.. }, updateSitemapCache)
79102 where
80103
81104 sitemapFeatureInterface = (emptyHackageFeature " sitemap" ) {
82- featureResources = [ xmlSitemapResource ]
105+ featureResources = [ xmlSitemapIndexResource, xmlSitemapResource ]
83106 , featureState = []
84- , featureDesc = " Provides a sitemap.xml for search engines"
107+ , featureDesc = " Provides sitemap for search engines"
85108 , featureCaches =
86109 [ CacheComponent {
87- cacheDesc = " sitemap.xml " ,
110+ cacheDesc = " sitemap" ,
88111 getCacheMemSize = memSize <$> readAsyncCache sitemapCache
89112 }
90113 ]
91114 , featurePostInit = do
92115 syncAsyncCache sitemapCache
93116 addCronJob serverCron CronJob {
94- cronJobName = " regenerate the cached sitemap.xml " ,
117+ cronJobName = " regenerate the cached sitemap" ,
95118 cronJobFrequency = DailyJobFrequency ,
96119 cronJobOneShot = False ,
97120 cronJobAction = prodAsyncCache sitemapCache " cron"
98121 }
99122 }
100123
124+ xmlSitemapIndexResource :: Resource
125+ xmlSitemapIndexResource = (resourceAt " /sitemap_index.xml" ) {
126+ resourceDesc = [(GET , " The dynamically generated sitemap index, in XML format" )]
127+ , resourceGet = [(" xml" , serveSitemapIndex)]
128+ }
129+
101130 xmlSitemapResource :: Resource
102- xmlSitemapResource = (resourceAt " /sitemap.xml " ) {
131+ xmlSitemapResource = (resourceAt " /sitemap/:filename " ) {
103132 resourceDesc = [(GET , " The dynamically generated sitemap, in XML format" )]
104133 , resourceGet = [(" xml" , serveSitemap)]
105134 }
106135
107- serveSitemap :: DynamicPath -> ServerPartE Response
108- serveSitemap _ = do
109- sitemapXML <- liftIO $ readAsyncCache sitemapCache
136+ serveSitemapIndex :: DynamicPath -> ServerPartE Response
137+ serveSitemapIndex _ = do
138+ Sitemap { .. } <- liftIO $ readAsyncCache sitemapCache
110139 cacheControlWithoutETag [Public , maxAgeDays 1 ]
111- return (toResponse sitemapXML)
140+ return (toResponse sitemapIndex)
141+
142+ serveSitemap :: DynamicPath -> ServerPartE Response
143+ serveSitemap dpath =
144+ case lookup " filename" dpath of
145+ Just filename
146+ | [basename, " xml" ] <- splitOn " ." filename
147+ , Just i <- readMaybe basename -> do
148+ Sitemap {.. } <- liftIO $ readAsyncCache sitemapCache
149+ guard (i < length sitemaps)
150+ cacheControlWithoutETag [Public , maxAgeDays 1 ]
151+ return (toResponse (sitemaps !! i))
152+ _ -> mzero
112153
113154 -- Generates a list of sitemap entries corresponding to hackage pages, then
114155 -- builds and returns an XML sitemap.
115- updateSitemapCache :: IO XMLResponse
156+ updateSitemapCache :: IO Sitemap
116157 updateSitemapCache = do
117158
118159 alltags <- queryGetTagList
119160 pkgIndex <- queryGetPackageIndex
120161 docIndex <- queryDocumentationIndex
121162
122- let sitemap = generateSitemap serverBaseURI pageBuildDate
163+ sitemaps <- generateSitemap serverBaseURI pageBuildDate
123164 (map fst alltags)
124- pkgIndex docIndex
125- return (XMLResponse sitemap)
165+ pkgIndex docIndex cachedTarIndex
166+ let uriScheme i = " /sitemap/" <> show i <> " .xml"
167+ sitemapIndex = renderSitemapIndex serverBaseURI (map uriScheme [0 .. (length sitemaps - 1 )])
168+ return $ Sitemap (XMLResponse sitemapIndex) (map XMLResponse sitemaps)
126169
127170 pageBuildDate :: T. Text
128171 pageBuildDate = T. pack (showGregorian (utctDay initTime))
@@ -131,19 +174,21 @@ generateSitemap :: URI
131174 -> T. Text
132175 -> [Tag ]
133176 -> PackageIndex. PackageIndex PkgInfo
134- -> Map. Map PackageId a
135- -> ByteString
136- generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex =
137- renderSitemap serverBaseURI allEntries
177+ -> Map. Map PackageId BlobId
178+ -> (BlobId -> IO Tar. TarIndex )
179+ -> IO [ByteString ]
180+ generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex cachedTarIndex = do
181+ versionedDocSubEntries <- versionedDocSubEntriesIO
182+ let -- Combine and build sitemap
183+ allEntries = miscEntries
184+ ++ tagEntries
185+ ++ nameEntries
186+ ++ nameVersEntries
187+ ++ baseDocEntries
188+ ++ versionedDocEntries
189+ ++ versionedDocSubEntries
190+ pure $ renderSitemap serverBaseURI <$> chunksOf 50000 allEntries
138191 where
139- -- Combine and build sitemap
140- allEntries = miscEntries
141- ++ tagEntries
142- ++ nameEntries
143- ++ nameVersEntries
144- ++ baseDocEntries
145- ++ versionedDocEntries
146-
147192 -- Misc. pages
148193 -- e.g. ["http://myhackage.com/index", ...]
149194 miscEntries = urlsToSitemapEntries miscPages pageBuildDate Weekly 0.75
@@ -224,3 +269,26 @@ generateSitemap serverBaseURI pageBuildDate alltags pkgIndex docIndex =
224269 , Map. member (packageId pkg) docIndex
225270 ]
226271 pageBuildDate Monthly 0.25
272+
273+ -- Versioned doc pages in subdirectories
274+ -- versionedSubDocURIs :: [path :: String]
275+ -- e.g. ["http://myhackage.com/packages/mypackage-1.0.2/docs/Lib.html", ...]
276+ versionedDocSubEntriesIO = do
277+ let pkgs = [ (pkg , blob)
278+ | pkg <- concat pkgss
279+ , Just blob <- [Map. lookup (packageId pkg) docIndex]
280+ ]
281+ pkgIndices <- traverse (\ (pkg, blob) -> (pkg,) <$> cachedTarIndex blob) pkgs
282+ pure $ urlsToSitemapEntries
283+ [ prefixPkgURI ++ display (packageId pkg) ++ " /docs" ++ fp
284+ | (pkg, tarIndex) <- pkgIndices
285+ , Just tar <- [Tar. lookup tarIndex " " ]
286+ , fp <- entryToPaths " /" tar
287+ , takeExtension fp == " .html"
288+ ]
289+ pageBuildDate Monthly 0.25
290+
291+ entryToPaths :: FilePath -> Tar. TarIndexEntry -> [FilePath ]
292+ entryToPaths _ (Tar. TarFileEntry _) = []
293+ entryToPaths base (Tar. TarDir content) = map ((base </> ) . fst ) content ++
294+ [ file | (folder, entry) <- content, file <- entryToPaths (base </> folder) entry ]
0 commit comments