diff options
Diffstat (limited to 'load_sources.py')
-rw-r--r-- | load_sources.py | 36 |
1 files changed, 36 insertions, 0 deletions
diff --git a/load_sources.py b/load_sources.py new file mode 100644 index 0000000..f420796 --- /dev/null +++ b/load_sources.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys + +import debian.deb822 +import pandas as pd + +def pkg(para): + d = dict(para) + a = dict() + + if 'Files' in d: + a['srcfile']=d['Files'].split('\n')[2].split(' ')[3] + + if 'Source' in d: + # Source fields sometimes have the source version number; strip it. + a['_srcpkg'] = d['Source'].split(' ')[0] + else: + # No 'Source' field means that it has the same value as the 'Package' + # field. + a['_srcpkg'] = d['Package'] + + return a + +if __name__ == '__main__': + packages = debian.deb822.Packages.iter_paragraphs(sys.stdin) + df = pd.DataFrame([pkg(p) for p in packages]) + store = pd.HDFStore('pkg.h5') + + print df + + store = pd.HDFStore('pkg.h5') + store['sources'] = df + store.close() + |