diff options
Diffstat (limited to 'load_packages.py')
-rw-r--r-- | load_packages.py | 21 |
1 files changed, 14 insertions, 7 deletions
diff --git a/load_packages.py b/load_packages.py
index 423b227..e16c90e 100644
--- a/load_packages.py
+++ b/load_packages.py
@@ -4,17 +4,24 @@ import sys
 import debian.deb822
 import pandas as pd
 
+def pkg(para):
+    d = dict(para)
+
+    if 'Source' in d:
+        # Source fields sometimes have the source version number; strip it.
+        d['_srcpkg'] = d['Source'].split(' ')[0]
+    else:
+        # No 'Source' field means that it has the same value as the 'Package'
+        # field.
+        d['_srcpkg'] = d['Package']
+
+    return d
+
 if __name__ == '__main__':
     packages = debian.deb822.Packages.iter_paragraphs(sys.stdin)
-    df = pd.DataFrame([dict(p) for p in packages])
+    df = pd.DataFrame([pkg(p) for p in packages])
     store = pd.HDFStore('pkg.h5')
 
-    # No 'Source' field means that it has the same value as the 'Package'
-    # field. Set this explicitly.
-    nosrc = df['Source'].isnull()
-    df['Source'][nosrc] = df[nosrc]['Package']
-    assert sum(pd.isnull(df['Source'])) == 0
-
     print df
     store = pd.HDFStore('pkg.h5')
 