From 3b69e5b4e9f05ffeb87ad74a8d373ea84030192c Mon Sep 17 00:00:00 2001
From: Dafydd Harries
Date: Mon, 18 Mar 2013 18:57:43 -0400
Subject: strip version number from Source fields

---
 export.py        |  4 ++--
 load_packages.py | 21 ++++++++++++++-------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/export.py b/export.py
index 4bcf4f5..1b132ef 100644
--- a/export.py
+++ b/export.py
@@ -156,7 +156,7 @@ def export(data, name):
     srcpkg_names = list(pkg_cps['_srcpkg'])
     print srcpkg_names
     binpkgs = pd.concat([
-        data.pkgs[data.pkgs['Source'] == srcpkg]
+        data.pkgs[data.pkgs['_srcpkg'] == srcpkg]
         for srcpkg in srcpkg_names])
     binpkg_names = sorted(binpkgs['Package'], key=len)
     print binpkg_names
@@ -241,7 +241,7 @@ def main():
 
     if len(args) == 0:
         # XXX use upstream names
-        srcps = sorted(set(data.pkgs['Source']))
+        srcps = sorted(set(data.pkgs['_srcpkg']))
 
         for pkgname in srcps[:100]:
             export(data, pkgname)
diff --git a/load_packages.py b/load_packages.py
index 423b227..e16c90e 100644
--- a/load_packages.py
+++ b/load_packages.py
@@ -4,17 +4,24 @@ import sys
 import debian.deb822
 import pandas as pd
 
+def pkg(para):
+    d = dict(para)
+
+    if 'Source' in d:
+        # Source fields sometimes have the source version number; strip it.
+        d['_srcpkg'] = d['Source'].split(' ')[0]
+    else:
+        # No 'Source' field means that it has the same value as the 'Package'
+        # field.
+        d['_srcpkg'] = d['Package']
+
+    return d
+
 if __name__ == '__main__':
     packages = debian.deb822.Packages.iter_paragraphs(sys.stdin)
-    df = pd.DataFrame([dict(p) for p in packages])
+    df = pd.DataFrame([pkg(p) for p in packages])
     store = pd.HDFStore('pkg.h5')
 
-    # No 'Source' field means that it has the same value as the 'Package'
-    # field. Set this explicitly.
-    nosrc = df['Source'].isnull()
-    df['Source'][nosrc] = df[nosrc]['Package']
-    assert sum(pd.isnull(df['Source'])) == 0
-
     print df
     store = pd.HDFStore('pkg.h5')
 
-- 
cgit v1.2.3