diff options
Diffstat (limited to 'src/FSD/Download.hs')
-rw-r--r-- | src/FSD/Download.hs | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/src/FSD/Download.hs b/src/FSD/Download.hs new file mode 100644 index 0000000..3631ddc --- /dev/null +++ b/src/FSD/Download.hs @@ -0,0 +1,118 @@ +{- +Copyright (C) 2022 Yuchen Pei. + +This file is part of fsd. + +fsd is free software: you can redistribute it and/or modify it under +the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +fsd is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General +Public License for more details. + +You should have received a copy of the GNU Affero General Public +License along with fsd. If not, see <https://www.gnu.org/licenses/>. + +-} + +{-# LANGUAGE ImportQualifiedPost #-} +{-# LANGUAGE OverloadedStrings #-} + +module FSD.Download where + +import System.FilePath +import Control.Monad +import Data.Function +import Data.List +import Data.Text (Text) +import Data.Text qualified as T +import System.Exit +import System.Process +import System.Random + +wgetCommand = "wget" + +wgetIndexFileFlags = "-O- 2>/dev/null" + +gunzipCommand = "gunzip" + +bunzip2Command = "bunzip2" + +wgetMetadataFlags = "-N -nv -x -c" + +sourcesUrl = "https://ftp.debian.org/debian/dists/stable/main/source/Sources.gz" + +packagesUrl = "https://ftp.debian.org/debian/dists/stable/main/binary-amd64/Packages.gz" + +translationUrl = "https://ftp.debian.org/debian/dists/stable/main/i18n/Translation-en.bz2" + +data MetadataType = MTChangelog | MTCopyright deriving Eq + +metadataUrlFile :: MetadataType -> Int -> String +metadataUrlFile mtype i = toFilename mtype ++ "_urls_" ++ show i + where toFilename MTChangelog = "changelog" + toFilename MTCopyright = "copyright" + +wgetIndexFileCommand :: FilePath -> (String, String) -> String +wgetIndexFileCommand outputDir (url, filename) = + wgetCommand ++ " " ++ wgetIndexFileFlags ++ " " ++ url ++ "|" ++ extractCommand ++ " >" ++ (outputDir </> filename) ++ " && echo Done: " ++ filename + where + extractCommand = if isSuffixOf ".gz" url then gunzipCommand else bunzip2Command + +downloadSources :: FilePath -> IO ExitCode +downloadSources outputDir = + waitForProcess =<< + (runCommand $ wgetIndexFileCommand outputDir (sourcesUrl, "Sources")) + +downloadPackages :: FilePath -> IO ExitCode +downloadPackages outputDir = + waitForProcess =<< + (runCommand $ wgetIndexFileCommand outputDir (packagesUrl, "Packages")) + +downloadTranslation :: FilePath -> IO ExitCode +downloadTranslation outputDir = + waitForProcess =<< + (runCommand $ + wgetIndexFileCommand outputDir (translationUrl, "Translation-en")) + +downloadMetadataFiles :: FilePath -> MetadataType -> Int -> IO [ProcessHandle] +downloadMetadataFiles root mtype nWorkers = + sequence $ + ( \i -> + runCommand + ( wgetCommand ++ " " ++ wgetMetadataFlags + ++ " -i " ++ (root </> metadataUrlFile mtype i) + ++ " -P " ++ root + ++ " && echo Done: worker " ++ show i + ) + ) + <$> [1 .. nWorkers] + +writeWgetListFiles :: FilePath -> MetadataType -> [Text] -> Int -> IO () +writeWgetListFiles outputDir mtype packages n = do + pairs <- zip [1 ..] . fmap T.unlines <$> mapM shuffle (replicate n urls) + mapM_ (\(i, list) -> + writeFile (outputDir </> metadataUrlFile mtype i) + (T.unpack list)) pairs + where + urls = metadataUrl mtype <$> packages + +shuffle :: [Text] -> IO [Text] +shuffle xs = + fmap fst . sortBy (compare `on` snd) . zip xs + <$> replicateM n (randomRIO (0, n * n)) + where + n = length xs + +metadataUrl :: MetadataType -> Text -> Text +metadataUrl mtype pkgName = + "https://metadata.ftp-master.debian.org/changelogs/main/" <> + (if T.isPrefixOf "lib" pkgName + then T.take 4 pkgName else T.take 1 pkgName) <> + "/" <> pkgName <> "/" <> toFilename mtype + where + toFilename MTChangelog = "stable_changelog" + toFilename MTCopyright = "stable_copyright" |