aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Dafydd Harries <daf@rhydd.org> 2013-03-18 18:57:43 -0400
committer Dafydd Harries <daf@rhydd.org> 2013-03-18 18:57:43 -0400
commit 3b69e5b4e9f05ffeb87ad74a8d373ea84030192c (patch)
tree b84392fec777eb9cc44d1faa57a1296e3c44263b
parent 51d82cb78c436a17ef1ffcf6ba4638183fad765a (diff)
strip version number from Source fields
-rw-r--r-- export.py 4
-rw-r--r-- load_packages.py 21
2 files changed, 16 insertions, 9 deletions
diff --git a/export.py b/export.py
index 4bcf4f5..1b132ef 100644
--- a/export.py
+++ b/export.py
@@ -156,7 +156,7 @@ def export(data, name):
srcpkg_names = list(pkg_cps['_srcpkg'])
print srcpkg_names
binpkgs = pd.concat([
- data.pkgs[data.pkgs['Source'] == srcpkg]
+ data.pkgs[data.pkgs['_srcpkg'] == srcpkg]
for srcpkg in srcpkg_names])
binpkg_names = sorted(binpkgs['Package'], key=len)
print binpkg_names
@@ -241,7 +241,7 @@ def main():
if len(args) == 0:
# XXX use upstream names
- srcps = sorted(set(data.pkgs['Source']))
+ srcps = sorted(set(data.pkgs['_srcpkg']))
for pkgname in srcps[:100]:
export(data, pkgname)
diff --git a/load_packages.py b/load_packages.py
index 423b227..e16c90e 100644
--- a/load_packages.py
+++ b/load_packages.py
@@ -4,17 +4,24 @@ import sys
import debian.deb822
import pandas as pd
+def pkg(para):
+ d = dict(para)
+
+ if 'Source' in d:
+ # Source fields sometimes have the source version number; strip it.
+ d['_srcpkg'] = d['Source'].split(' ')[0]
+ else:
+ # No 'Source' field means that it has the same value as the 'Package'
+ # field.
+ d['_srcpkg'] = d['Package']
+
+ return d
+
if __name__ == '__main__':
packages = debian.deb822.Packages.iter_paragraphs(sys.stdin)
- df = pd.DataFrame([dict(p) for p in packages])
+ df = pd.DataFrame([pkg(p) for p in packages])
store = pd.HDFStore('pkg.h5')
- # No 'Source' field means that it has the same value as the 'Package'
- # field. Set this explicitly.
- nosrc = df['Source'].isnull()
- df['Source'][nosrc] = df[nosrc]['Package']
- assert sum(pd.isnull(df['Source'])) == 0
-
print df
store = pd.HDFStore('pkg.h5')