diff options
-rw-r--r-- | export.py | 4 | ||||
-rw-r--r-- | load_packages.py | 21 |
2 files changed, 16 insertions, 9 deletions
@@ -156,7 +156,7 @@ def export(data, name): srcpkg_names = list(pkg_cps['_srcpkg']) print srcpkg_names binpkgs = pd.concat([ - data.pkgs[data.pkgs['Source'] == srcpkg] + data.pkgs[data.pkgs['_srcpkg'] == srcpkg] for srcpkg in srcpkg_names]) binpkg_names = sorted(binpkgs['Package'], key=len) print binpkg_names @@ -241,7 +241,7 @@ def main(): if len(args) == 0: # XXX use upstream names - srcps = sorted(set(data.pkgs['Source'])) + srcps = sorted(set(data.pkgs['_srcpkg'])) for pkgname in srcps[:100]: export(data, pkgname) diff --git a/load_packages.py b/load_packages.py index 423b227..e16c90e 100644 --- a/load_packages.py +++ b/load_packages.py @@ -4,17 +4,24 @@ import sys import debian.deb822 import pandas as pd +def pkg(para): + d = dict(para) + + if 'Source' in d: + # Source fields sometimes have the source version number; strip it. + d['_srcpkg'] = d['Source'].split(' ')[0] + else: + # No 'Source' field means that it has the same value as the 'Package' + # field. + d['_srcpkg'] = d['Package'] + + return d + if __name__ == '__main__': packages = debian.deb822.Packages.iter_paragraphs(sys.stdin) - df = pd.DataFrame([dict(p) for p in packages]) + df = pd.DataFrame([pkg(p) for p in packages]) store = pd.HDFStore('pkg.h5') - # No 'Source' field means that it has the same value as the 'Package' - # field. Set this explicitly. - nosrc = df['Source'].isnull() - df['Source'][nosrc] = df[nosrc]['Package'] - assert sum(pd.isnull(df['Source'])) == 0 - print df store = pd.HDFStore('pkg.h5') |