aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README4
-rw-r--r--load_descriptions.py12
2 files changed, 16 insertions, 0 deletions
diff --git a/README b/README
index b412693..3eeb444 100644
--- a/README
+++ b/README
@@ -8,6 +8,10 @@ Loading data from package files:
Packages files can be obtained from Debian mirrors, and are cached by APT in
/var/lib/apt/lists.
+Loading package descriptions:
+
+ $ pv .../Translation-en | python load_descriptions.py
+
Loading data from copyright files:
$ python load_copyright.py main/*/*/current/copyright | tee cp_import.log
diff --git a/load_descriptions.py b/load_descriptions.py
new file mode 100644
index 0000000..36d6257
--- /dev/null
+++ b/load_descriptions.py
@@ -0,0 +1,12 @@
+
+import sys
+
+import debian.deb822
+import pandas as pd
+
+descs = debian.deb822.Packages.iter_paragraphs(sys.stdin)
+df = pd.DataFrame([dict(p) for p in descs])
+# The context manager closes pkg.h5 even if the write below raises.
+with pd.HDFStore('pkg.h5') as store:
+    store['descriptions'] = df
+