#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Provenance: load_sources.py, added in commit
# 39b340faf35b1d59aae37b13b4ed850d50be1d5f by Ruben Rodriguez on
# Fri, 17 Jul 2015 14:10:11 -0500 ("Parse Sources file to extract
# information about the source tarball url").
"""Parse a Sources file to extract information about the source tarball.

The paragraphs are read from standard input, reduced to one small record
per paragraph by :func:`pkg`, printed as a DataFrame, and stored under the
``sources`` key of the HDF5 file ``pkg.h5``.
"""

import sys

import pandas as pd


def pkg(para):
    """Reduce one paragraph to the fields this script stores.

    Args:
        para: Anything accepted by ``dict()``, mapping field names to
            their string values.

    Returns:
        A dict that always contains ``'_srcpkg'`` and, when it can be
        extracted from the ``'Files'`` field, ``'srcfile'``.

    Raises:
        KeyError: If the paragraph has neither a ``'Source'`` nor a
            ``'Package'`` field.
    """
    d = dict(para)
    a = dict()

    if 'Files' in d:
        # The file name is taken from the fourth space-separated token of
        # the third newline-separated line.  This assumes a value shaped
        # like "\n <checksum> <size> <name>\n <checksum> <size> <name>..."
        # (i.e. the second listed file) -- TODO confirm against the parser.
        # Paragraphs whose 'Files' field is too short are skipped instead
        # of aborting the whole run with an IndexError.
        lines = d['Files'].split('\n')
        if len(lines) > 2:
            fields = lines[2].split(' ')
            if len(fields) > 3:
                a['srcfile'] = fields[3]

    if 'Source' in d:
        # Source fields sometimes have the source version number; strip it.
        a['_srcpkg'] = d['Source'].split(' ')[0]
    else:
        # No 'Source' field means that it has the same value as the 'Package'
        # field.
        a['_srcpkg'] = d['Package']

    return a


def main():
    """Read paragraphs from stdin, print them and save them to pkg.h5."""
    # Imported here rather than at module level so that pkg() can be
    # imported and reused without python-debian being installed.
    import debian.deb822

    packages = debian.deb822.Packages.iter_paragraphs(sys.stdin)
    df = pd.DataFrame([pkg(p) for p in packages])

    # Parenthesised form behaves the same on Python 2 and Python 3.
    print(df)

    # Open the store exactly once and make sure it is closed even if the
    # write fails, so no HDF5 handle is leaked.
    store = pd.HDFStore('pkg.h5')
    try:
        store['sources'] = df
    finally:
        store.close()


if __name__ == '__main__':
    main()