about summary refs log tree commit diff
path: root/load_packages.py
diff options
context:
space:
mode:
Diffstat (limited to 'load_packages.py')
-rw-r--r-- load_packages.py 21
1 files changed, 14 insertions, 7 deletions
diff --git a/load_packages.py b/load_packages.py
index 423b227..e16c90e 100644
--- a/load_packages.py
+++ b/load_packages.py
@@ -4,17 +4,24 @@ import sys
import debian.deb822
import pandas as pd
+def pkg(para):
+ d = dict(para)
+
+ if 'Source' in d:
+ # Source fields sometimes have the source version number; strip it.
+ d['_srcpkg'] = d['Source'].split(' ')[0]
+ else:
+ # No 'Source' field means that it has the same value as the 'Package'
+ # field.
+ d['_srcpkg'] = d['Package']
+
+ return d
+
if __name__ == '__main__':
packages = debian.deb822.Packages.iter_paragraphs(sys.stdin)
- df = pd.DataFrame([dict(p) for p in packages])
+ df = pd.DataFrame([pkg(p) for p in packages])
store = pd.HDFStore('pkg.h5')
- # No 'Source' field means that it has the same value as the 'Package'
- # field. Set this explicitly.
- nosrc = df['Source'].isnull()
- df['Source'][nosrc] = df[nosrc]['Package']
- assert sum(pd.isnull(df['Source'])) == 0
-
print df
store = pd.HDFStore('pkg.h5')