From 39b340faf35b1d59aae37b13b4ed850d50be1d5f Mon Sep 17 00:00:00 2001
From: Ruben Rodriguez
Date: Fri, 17 Jul 2015 14:10:11 -0500
Subject: Parse Sources file to extract information about the source tarball url

---
 export.py       | 12 ++++++++
 load_sources.py | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 load_sources.py

diff --git a/export.py b/export.py
index 8bd72bd..3a93602 100644
--- a/export.py
+++ b/export.py
@@ -52,6 +52,7 @@ class PkgData(object):
         pkg_store = pd.HDFStore('pkg.h5')
         self.pkgs = pkg_store['packages']
         self.descs = pkg_store['descriptions']
+        self.srcs = pkg_store['sources']
         pkg_store.close()
 
         cp_store = pd.HDFStore('cp.h5')
@@ -224,6 +225,15 @@ def export_srcpkgs(data, name, srcpkg_names):
         for srcpkg in srcpkg_names])
 
     versions = {}
+    srcfiles=""
+    for srcpkg in srcpkg_names:
+        srcfile = data.srcs[data.srcs['_srcpkg'] == srcpkg]['srcfile'].values[0]
+        letter = srcfile[0]
+        if srcfile[:3] == 'lib':
+            letter = srcfile[:4]
+        srcfile = 'http://ftp.debian.org/debian/pool/main/%s/%s/%s' % (letter,srcpkg,srcfile)
+        srcfiles = srcfiles + srcfile + " "
+
     for (_i, pkg) in binpkgs.iterrows():
         versions[pkg['_srcpkg']] = pkg['Version']
 
@@ -267,6 +277,8 @@ def export_srcpkgs(data, name, srcpkg_names):
         ('Computer languages', ', '.join(langs)),
         ('Status', ''),
         ('Is GNU', 'No'),
+        ('Version identifier', pkg['Version']),
+        ('Version download', srcfiles),
         ('Submitted by', 'Debian import'),
         ('Submitted date', today())])
 
diff --git a/load_sources.py b/load_sources.py
new file mode 100644
index 0000000..f420796
--- /dev/null
+++ b/load_sources.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Load a Debian Sources index from stdin into the 'sources' table of pkg.h5.
+
+Every stanza read from stdin becomes one row holding the source package name
+('_srcpkg') and, when it can be identified, the file name of the source
+tarball ('srcfile').
+"""
+
+import sys
+
+import debian.deb822
+import pandas as pd
+
+
+def _source_tarball(files_field):
+    """Return the source tarball named in a raw 'Files' field, or None.
+
+    Entry lines are expected to hold three whitespace-separated columns with
+    the file name last; any other line (such as an empty one) is skipped, as
+    are detached '.asc' signatures.  The upstream '.orig.tar.*' archive is
+    preferred; failing that, the first tarball that is not the
+    '.debian.tar.*' packaging archive is used, which covers packages that
+    ship a single tarball.
+    """
+    names = [cols[2]
+             for cols in (line.split() for line in files_field.split('\n'))
+             if len(cols) == 3]
+    tarballs = [name for name in names
+                if '.tar.' in name and not name.endswith('.asc')]
+    for name in tarballs:
+        if '.orig.tar.' in name:
+            return name
+    for name in tarballs:
+        if '.debian.tar.' not in name:
+            return name
+    return None
+
+
+def pkg(para):
+    """Reduce one stanza to the columns stored in the 'sources' table.
+
+    Returns a dict with '_srcpkg' and, if a source tarball is listed in the
+    'Files' field, 'srcfile'.
+    """
+    d = dict(para)
+    a = {}
+
+    if 'Files' in d:
+        # Pick the tarball by name rather than assuming it is the second
+        # entry of the list.
+        srcfile = _source_tarball(d['Files'])
+        if srcfile is not None:
+            a['srcfile'] = srcfile
+
+    if 'Source' in d:
+        # Source fields sometimes have the source version number; strip it.
+        a['_srcpkg'] = d['Source'].split(' ')[0]
+    else:
+        # No 'Source' field means that it has the same value as the 'Package'
+        # field.
+        a['_srcpkg'] = d['Package']
+
+    return a
+
+
+if __name__ == '__main__':
+    packages = debian.deb822.Packages.iter_paragraphs(sys.stdin)
+    df = pd.DataFrame([pkg(p) for p in packages])
+
+    print(df)
+
+    # Open the store once and close it even if the write fails.
+    store = pd.HDFStore('pkg.h5')
+    try:
+        store['sources'] = df
+    finally:
+        store.close()
-- 
cgit v1.2.3