diff options
| author | Dafydd Harries <daf@rhydd.org> | 2013-03-12 19:53:28 -0400 | 
|---|---|---|
| committer | Dafydd Harries <daf@rhydd.org> | 2013-03-12 19:53:28 -0400 | 
| commit | 7ac7787095c3fe6c730a16523d7e00785c6c40cd (patch) | |
| tree | 56b9c3cbc55dd53dc60f9eb640e28ce595460314 | |
import
| -rw-r--r-- | README | 21 | ||||
| -rw-r--r-- | load_copyright.py | 130 | ||||
| -rw-r--r-- | load_packages.py | 22 | 
3 files changed, 173 insertions, 0 deletions
| @@ -0,0 +1,21 @@ + +## Importing data + +Loading data from package files: + +    $ pv .../Packages | python load_packages.py + +Packages files can be obtained from Debian mirrors, and are cached by APT in +/var/lib/apt/lists. + +Loading data from copyright files: + +    $ python load_copyright.py main/*/*/current/copyright | tee cp_import.log + +Unfortunately, I don't know of a way to easily and quickly get copyright files +for all packages in main if you are not a Debian developer. I obtained them by +logging into powell.debian.org (which hosts packages.debian.org) and running: + +    $ cd /srv/packages.debian.org/www/changelogs/pool +    $ tar -zchf ~/copyright.tar.gz main/*/*/current/copyright + diff --git a/load_copyright.py b/load_copyright.py new file mode 100644 index 0000000..4016857 --- /dev/null +++ b/load_copyright.py @@ -0,0 +1,130 @@ + +import os +import sys + +import debian.deb822 +import pandas as pd + +#from IPython.core import ultratb +#sys.excepthook = ultratb.FormattedTB(mode='Verbose', +#     color_scheme='Linux', call_pdb=1) + +class BadFormat(Exception): +    pass + +def read_copyright(fh): +    paras = debian.deb822.Deb822.iter_paragraphs(fh) + +    try: +        header = paras.next() + +        header = dict(header) +        paras = [(p.keys()[0], dict(p)) for p in paras] +    except (KeyError, TypeError, StopIteration): +        raise BadFormat('not in DEP-5 format?') + +    if 'Format' not in header and 'Format-Specification' not in header: +        raise ValueError('no Format field') + +    files = [] +    licences = [] +    file_fields = set([ +        'Authors', 'Files', 'Comment', 'Copyright', 'Disclaimer', 'Homepage', +        'License', 'License-Alias', 'Upstream-Authors', 'X-Comment']) + +    for (type, d) in paras: +        type = type.replace('Licence', 'License') + +        if 'Licence' in d: +            # !!! 
+            d['License'] = d['Licence'] +            del d['Licence'] + +        if type == 'Files': +            if 'License' not in d: +                raise ValueError('no license: ' + repr(d)) + +            keys = set(d.keys()) +            assert keys <= file_fields, keys +            files.append(d) +        elif type == 'License': +            # XXX constrain permissible keys here? +            #print d.keys() +            licences.append(d) +        else: +            # Be conservative. Missing license information is a problem. +            raise ValueError('bad para: ' + type) + +    return (header, files, licences) + +def import_one(pkgname, fh): +    try: +        (header, files, licences) = read_copyright(fh) +    except BadFormat: +        print 'info: not readable' +        return None +    except ValueError, e: +        print 'err:', e +        #print 'err:', repr(e) +        return None + +    header['Package'] = pkgname +    copy_summary = pd.DataFrame([header]) +    #print copy_summary.T.to_string() +    #print + +    for d in files: +        d['Package'] = pkgname +        d['_license'] = d['License'].split('\n')[0] + +    for d in licences: +        d['Package'] = pkgname +        d['_license'] = d['License'].split('\n')[0] + +    copy_files = pd.DataFrame(files) +    licence = pd.DataFrame(licences) +    return (copy_summary, copy_files, licence) + +def get_pkgname(path): +    (dir, base) = os.path.split(path) + +    if base in ('current', 'copyright'): +        return get_pkgname(dir) +    else: +        return base + +def main(paths): +    summaries = [] +    files = [] +    licenses = [] + +    for path in paths: +        pkgname = get_pkgname(path) +        print pkgname, path +        data = import_one(pkgname, file(path)) + +        if data is not None: +            (summary, file_, license) = data +            summaries.append(summary) +            files.append(file_) +            licenses.append(license) + +    summaries = 
pd.concat(summaries) +    files = pd.concat(files) +    licenses = pd.concat(licenses) + +    #from IPython import embed +    #embed() + +    #from IPython.core.debugger import Pdb +    #Pdb().set_trace() + +    store = pd.HDFStore('cp.h5') +    store['cp_summary'] = summaries +    store['cp_files'] = files +    store['licenses'] = licenses +    store.close() + +if __name__ == '__main__': +    main(sys.argv[1:]) + diff --git a/load_packages.py b/load_packages.py new file mode 100644 index 0000000..c02eabd --- /dev/null +++ b/load_packages.py @@ -0,0 +1,22 @@ + +import sys + +import debian.deb822 +import pandas as pd + +packages = debian.deb822.Packages.iter_paragraphs(sys.stdin) +df = pd.DataFrame([dict(p) for p in packages]) +store = pd.HDFStore('pkg.h5') + +# No 'Source' field means that it has the same value as the 'Package' field. +# Set this explicitly. +nosrc = df['Source'].isnull() +df['Source'][nosrc] = df[nosrc]['Package'] +assert sum(pd.isnull(df['Source'])) == 0 + +print df + +store = pd.HDFStore('pkg.h5') +store['packages'] = df +store.close() + | 
