From 7ac7787095c3fe6c730a16523d7e00785c6c40cd Mon Sep 17 00:00:00 2001 From: Dafydd Harries Date: Tue, 12 Mar 2013 19:53:28 -0400 Subject: import --- README | 21 +++++++++ load_copyright.py | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ load_packages.py | 22 +++++++++ 3 files changed, 173 insertions(+) create mode 100644 README create mode 100644 load_copyright.py create mode 100644 load_packages.py diff --git a/README b/README new file mode 100644 index 0000000..b412693 --- /dev/null +++ b/README @@ -0,0 +1,21 @@ + +## Importing data + +Loading data from package files: + + $ pv .../Packages | python load_packages.py + +Packages files can be obtained from Debian mirrors, and are cached by APT in +/var/lib/apt/lists. + +Loading data from copyright files: + + $ python load_copyright.py main/*/*/current/copyright | tee cp_import.log + +Unfortunately, I don't know of a way to easily and quickly get copyright files +for all packages in main if you are not a Debian developer. 
"""Import DEP-5 (machine-readable) Debian copyright files into ``cp.h5``.

Usage::

    $ python load_copyright.py main/*/*/current/copyright | tee cp_import.log

The copyright files used for the original import were obtained by logging
into powell.debian.org (which hosts packages.debian.org) and running::

    $ cd /srv/packages.debian.org/www/changelogs/pool
    $ tar -zchf ~/copyright.tar.gz main/*/*/current/copyright
"""

import os
import sys

import pandas as pd

# Fields that are accepted in a "Files" paragraph.
_FILE_FIELDS = frozenset([
    'Authors', 'Files', 'Comment', 'Copyright', 'Disclaimer', 'Homepage',
    'License', 'License-Alias', 'Upstream-Authors', 'X-Comment'])

# Path components that are skipped when deriving a package name from a path.
_NON_PACKAGE_COMPONENTS = ('current', 'copyright')


class BadFormat(Exception):
    """Raised when a copyright file cannot be read as DEP-5 paragraphs."""


def read_copyright(fh):
    """Parse one copyright file into its header, Files and License paragraphs.

    Args:
        fh: an open file (or other sequence of lines) holding the copyright
            file, as accepted by ``Deb822.iter_paragraphs``.

    Returns:
        A tuple ``(header, files, licences)``: ``header`` is a dict of the
        first paragraph, ``files`` a list of dicts for paragraphs whose first
        field is ``Files``, and ``licences`` a list of dicts for paragraphs
        whose first field is ``License``.  The misspelling ``Licence`` is
        normalised to ``License`` in both cases.

    Raises:
        BadFormat: if the paragraphs cannot be read at all.
        ValueError: if the header has no Format field, a Files paragraph has
            no license or carries an unexpected field, or a paragraph is
            neither a Files nor a License paragraph.
    """
    # Imported here rather than at module level so that the helpers which do
    # not parse anything (e.g. get_pkgname) stay importable without
    # python-debian installed.
    import debian.deb822

    paras = debian.deb822.Deb822.iter_paragraphs(fh)

    try:
        header = dict(next(paras))
        # Each paragraph is classified by the name of its first field.
        body = [(list(p.keys())[0], dict(p)) for p in paras]
    except (KeyError, TypeError, IndexError, StopIteration):
        raise BadFormat('not in DEP-5 format?')

    if 'Format' not in header and 'Format-Specification' not in header:
        raise ValueError('no Format field')

    files = []
    licences = []

    for (kind, d) in body:
        kind = kind.replace('Licence', 'License')

        if 'Licence' in d:
            # Some files use the British spelling; store it under 'License'.
            d['License'] = d.pop('Licence')

        if kind == 'Files':
            if 'License' not in d:
                raise ValueError('no license: ' + repr(d))

            # Reported as ValueError (not assert) so that one odd file is
            # logged by import_one instead of aborting the whole run, and so
            # that the check still happens under ``python -O``.
            unknown = set(d) - _FILE_FIELDS
            if unknown:
                raise ValueError(
                    'unexpected fields: ' + ', '.join(sorted(unknown)))

            files.append(d)
        elif kind == 'License':
            licences.append(d)
        else:
            # Be conservative. Missing license information is a problem.
            raise ValueError('bad para: ' + kind)

    return (header, files, licences)


def import_one(pkgname, fh):
    """Read one package's copyright file into three DataFrames.

    Args:
        pkgname: name stored in the ``Package`` column of every row.
        fh: open copyright file, passed to :func:`read_copyright`.

    Returns:
        ``(copy_summary, copy_files, licence)`` DataFrames built from the
        header, the Files paragraphs and the License paragraphs, or ``None``
        if the file could not be read (a message is printed in that case).
    """
    try:
        (header, files, licences) = read_copyright(fh)
    except BadFormat:
        print('info: not readable')
        return None
    except ValueError as e:
        print('err: %s' % (e,))
        return None

    header['Package'] = pkgname

    for d in files + licences:
        d['Package'] = pkgname
        # The first line of the License field is the short licence name.
        d['_license'] = d['License'].split('\n')[0]

    copy_summary = pd.DataFrame([header])
    copy_files = pd.DataFrame(files)
    licence = pd.DataFrame(licences)
    return (copy_summary, copy_files, licence)


def get_pkgname(path):
    """Return the package name encoded in a ``.../<pkg>/current/copyright`` path.

    Trailing ``current`` and ``copyright`` components are stripped and the
    last remaining component is returned (the empty string if none is left).
    """
    (parent, base) = os.path.split(path)

    while base in _NON_PACKAGE_COMPONENTS:
        (parent, base) = os.path.split(parent)

    return base


def main(paths):
    """Import every copyright file in *paths* and write the result to cp.h5.

    The store receives three tables: ``cp_summary``, ``cp_files`` and
    ``licenses``.  Nothing is written if no file could be imported.
    """
    summaries = []
    files = []
    licenses = []

    for path in paths:
        pkgname = get_pkgname(path)
        print('%s %s' % (pkgname, path))

        # Close each file as soon as it is parsed; a full import opens one
        # file per package.
        with open(path) as fh:
            data = import_one(pkgname, fh)

        if data is None:
            continue

        (summary, file_, license_) = data
        summaries.append(summary)
        files.append(file_)
        licenses.append(license_)

    if not summaries:
        # pd.concat() raises on an empty list.
        print('info: nothing imported')
        return

    summaries = pd.concat(summaries)
    files = pd.concat(files)
    licenses = pd.concat(licenses)

    store = pd.HDFStore('cp.h5')
    try:
        store['cp_summary'] = summaries
        store['cp_files'] = files
        store['licenses'] = licenses
    finally:
        store.close()


if __name__ == '__main__':
    main(sys.argv[1:])
"""Load a Debian ``Packages`` index from standard input into ``pkg.h5``.

Usage::

    $ pv .../Packages | python load_packages.py

The resulting DataFrame is printed and stored under the key ``packages``.
"""

import sys

import pandas as pd


def fill_missing_source(df):
    """Make the ``Source`` column of *df* explicit for every row.

    No 'Source' field means that it has the same value as the 'Package'
    field, so missing values (or a missing column) are filled from
    ``Package``.

    Args:
        df: DataFrame with one row per package paragraph; it is modified in
            place.

    Returns:
        The same DataFrame, with a fully populated ``Source`` column.

    Raises:
        ValueError: if there is no ``Package`` column to fall back on, or if
            some row has neither a source nor a package name.
    """
    if 'Package' not in df.columns:
        raise ValueError("no 'Package' field in input")

    if 'Source' in df.columns:
        # Assign the whole column rather than using chained indexing
        # (df['Source'][mask] = ...), which may write to a temporary copy.
        df['Source'] = df['Source'].fillna(df['Package'])
    else:
        # None of the paragraphs carried a Source field.
        df['Source'] = df['Package']

    if df['Source'].isnull().any():
        raise ValueError("package without 'Source' or 'Package' value")

    return df


def main():
    """Read Packages paragraphs from stdin and store them in pkg.h5."""
    # Imported here so that fill_missing_source stays importable without
    # python-debian installed.
    import debian.deb822

    packages = debian.deb822.Packages.iter_paragraphs(sys.stdin)
    df = pd.DataFrame([dict(p) for p in packages])
    df = fill_missing_source(df)

    print(df)

    # Open the store once, only when there is something to write, and make
    # sure it is closed even if the write fails.
    store = pd.HDFStore('pkg.h5')
    try:
        store['packages'] = df
    finally:
        store.close()


if __name__ == '__main__':
    main()