aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Dafydd Harries <daf@rhydd.org> 2013-03-12 19:53:28 -0400
committer Dafydd Harries <daf@rhydd.org> 2013-03-12 19:53:28 -0400
commit 7ac7787095c3fe6c730a16523d7e00785c6c40cd (patch)
tree 56b9c3cbc55dd53dc60f9eb640e28ce595460314
import
-rw-r--r-- README 21
-rw-r--r-- load_copyright.py 130
-rw-r--r-- load_packages.py 22
3 files changed, 173 insertions, 0 deletions
diff --git a/README b/README
new file mode 100644
index 0000000..b412693
--- /dev/null
+++ b/README
@@ -0,0 +1,21 @@
+
+## Importing data
+
+Loading data from package files:
+
+ $ pv .../Packages | python load_packages.py
+
+Packages files can be obtained from Debian mirrors, and are cached by APT in
+/var/lib/apt/lists.
+
+Loading data from copyright files:
+
+ $ python load_copyright.py main/*/*/current/copyright | tee cp_import.log
+
+Unfortunately, I don't know of a way to easily and quickly get copyright files
+for all packages in main if you are not a Debian developer. I obtained them by
+logging into powell.debian.org (which hosts packages.debian.org) and running:
+
+ $ cd /srv/packages.debian.org/www/changelogs/pool
+ $ tar -zchf ~/copyright.tar.gz main/*/*/current/copyright
+
diff --git a/load_copyright.py b/load_copyright.py
new file mode 100644
index 0000000..4016857
--- /dev/null
+++ b/load_copyright.py
@@ -0,0 +1,130 @@
+
+import os
+import sys
+
+import debian.deb822
+import pandas as pd
+
+#from IPython.core import ultratb
+#sys.excepthook = ultratb.FormattedTB(mode='Verbose',
+# color_scheme='Linux', call_pdb=1)
+
+class BadFormat(Exception):
+    """Raised by read_copyright() when the input cannot be parsed as DEP-5."""
+
+def read_copyright(fh):
+    """Parse a machine-readable (DEP-5 style) copyright file.
+
+    fh is passed straight to debian.deb822.Deb822.iter_paragraphs().
+
+    Returns a (header, files, licences) tuple: the first paragraph as a
+    dict, a list of dicts for the 'Files' paragraphs, and a list of dicts
+    for the stand-alone 'License' paragraphs.
+
+    Raises BadFormat when the input cannot be read as a sequence of
+    paragraphs, and ValueError when it can be read but fails one of the
+    checks below: no Format field, a Files paragraph without a licence or
+    with an unexpected field, or a paragraph of unknown type.
+    """
+    paras = debian.deb822.Deb822.iter_paragraphs(fh)
+
+    try:
+        # next() rather than .next() so this runs on Python 2.6+ and 3.
+        header = dict(next(paras))
+        # A paragraph is classified by the name of its first field.  An
+        # empty paragraph has no first field (IndexError) and is treated
+        # like any other unparseable input.
+        paras = [(list(p.keys())[0], dict(p)) for p in paras]
+    except (IndexError, KeyError, TypeError, StopIteration):
+        raise BadFormat('not in DEP-5 format?')
+
+    if 'Format' not in header and 'Format-Specification' not in header:
+        raise ValueError('no Format field')
+
+    files = []
+    licences = []
+    file_fields = set([
+        'Authors', 'Files', 'Comment', 'Copyright', 'Disclaimer', 'Homepage',
+        'License', 'License-Alias', 'Upstream-Authors', 'X-Comment'])
+
+    for (kind, d) in paras:
+        # Accept the 'Licence' spelling both as paragraph type and as key.
+        kind = kind.replace('Licence', 'License')
+
+        if 'Licence' in d:
+            d['License'] = d.pop('Licence')
+
+        if kind == 'Files':
+            if 'License' not in d:
+                raise ValueError('no license: ' + repr(d))
+
+            # Raise instead of assert: asserts vanish under -O, and a
+            # ValueError is reported per file like the other checks here.
+            unknown = set(d) - file_fields
+            if unknown:
+                raise ValueError(
+                    'unexpected fields: ' + repr(sorted(unknown)))
+            files.append(d)
+        elif kind == 'License':
+            # XXX constrain permissible keys here?
+            licences.append(d)
+        else:
+            # Be conservative. Missing license information is a problem.
+            raise ValueError('bad para: ' + kind)
+
+    return (header, files, licences)
+
+def import_one(pkgname, fh):
+    """Read one copyright file and turn it into three DataFrames.
+
+    pkgname is stored in a 'Package' column on every row; fh is handed
+    to read_copyright().
+
+    Returns (copy_summary, copy_files, licence): a one-row frame built
+    from the header paragraph, a frame with one row per Files paragraph,
+    and a frame with one row per stand-alone License paragraph.  Returns
+    None, after printing a diagnostic, when read_copyright() raises
+    BadFormat or ValueError.
+    """
+    try:
+        (header, files, licences) = read_copyright(fh)
+    except BadFormat:
+        print('info: not readable')
+        return None
+    except ValueError as e:
+        print('err: %s' % (e,))
+        return None
+
+    header['Package'] = pkgname
+    copy_summary = pd.DataFrame([header])
+
+    # Both kinds of paragraph get the same two extra columns; '_license'
+    # is the first line of the License field.
+    for d in files + licences:
+        d['Package'] = pkgname
+        d['_license'] = d['License'].split('\n')[0]
+
+    copy_files = pd.DataFrame(files)
+    licence = pd.DataFrame(licences)
+    return (copy_summary, copy_files, licence)
+
+def get_pkgname(path):
+    """Return the package name encoded in a copyright file path.
+
+    Trailing 'copyright' and 'current' components are dropped and the
+    last remaining component is returned, so
+    'main/f/foo/current/copyright' gives 'foo'.  Trailing path separators
+    are ignored.  Returns '' when no other component is left.
+    """
+    while True:
+        # Without the rstrip a trailing separator makes os.path.split()
+        # return an empty last component.
+        (parent, base) = os.path.split(path.rstrip(os.sep))
+        if base not in ('current', 'copyright'):
+            return base
+        path = parent
+
+def main(paths):
+    """Import every copyright file in paths and save the result to cp.h5.
+
+    Each path is read with import_one(); files it rejects are skipped.
+    The per-file frames are concatenated and stored in the HDF5 file
+    'cp.h5' under the keys 'cp_summary', 'cp_files' and 'licenses'.
+    Nothing is written when no file could be imported.
+    """
+    summaries = []
+    files = []
+    licenses = []
+
+    for path in paths:
+        pkgname = get_pkgname(path)
+        print('%s %s' % (pkgname, path))
+
+        # Close each file as soon as it has been parsed instead of
+        # leaving every handle open until it is garbage collected.
+        with open(path) as fh:
+            data = import_one(pkgname, fh)
+
+        if data is not None:
+            (summary, file_rows, license_rows) = data
+            summaries.append(summary)
+            files.append(file_rows)
+            licenses.append(license_rows)
+
+    if not summaries:
+        # pd.concat() raises ValueError when given an empty list.
+        print('err: nothing imported')
+        return
+
+    summaries = pd.concat(summaries)
+    files = pd.concat(files)
+    licenses = pd.concat(licenses)
+
+    store = pd.HDFStore('cp.h5')
+    try:
+        store['cp_summary'] = summaries
+        store['cp_files'] = files
+        store['licenses'] = licenses
+    finally:
+        # Close the store even if one of the writes fails.
+        store.close()
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
+
diff --git a/load_packages.py b/load_packages.py
new file mode 100644
index 0000000..c02eabd
--- /dev/null
+++ b/load_packages.py
@@ -0,0 +1,22 @@
+
+import sys
+
+import debian.deb822
+import pandas as pd
+
+# Read a Packages file from stdin, one DataFrame row per paragraph.
+packages = debian.deb822.Packages.iter_paragraphs(sys.stdin)
+df = pd.DataFrame([dict(p) for p in packages])
+
+# No 'Source' field means that it has the same value as the 'Package' field.
+# Set this explicitly.
+if 'Source' in df.columns:
+    # Assign the whole column; chained indexing (df['Source'][mask] = ...)
+    # may modify a temporary copy instead of df.
+    df['Source'] = df['Source'].fillna(df['Package'])
+else:
+    # No paragraph had a Source field, so the column does not exist yet.
+    df['Source'] = df['Package']
+assert df['Source'].notnull().all()
+
+print(df)
+
+# Open the store once, only when there is something to write, and close it
+# even if the write fails.
+store = pd.HDFStore('pkg.h5')
+try:
+    store['packages'] = df
+finally:
+    store.close()
+
+