1 files changed, 0 insertions, 419 deletions
diff --git a/export.py b/export.py
deleted file mode 100644
index bf45e5e..0000000
--- a/export.py
+++ /dev/null
@@ -1,419 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import datetime
-import itertools
-import json
-import os
-import re
-import sys
-import textwrap
-
-import pandas as pd
-
-import license
-
-# Package fields which refer to download locations.
-# The names are whitespace-separated; extract_resources() splits this
-# string and emits one 'Resource' template for every listed field that is
-# present in the row it is given.
-download_keys = """
-    Origin
-    Original-Source
-    Source
-    Source-Code
-    X-Origin
-    X-Original-Package
-    X-Source
-    """
-
-def concat(xss):
-    """Flatten one level of nesting.
-
-    `xss` is an iterable of iterables.  Returns a new list holding the
-    elements of every inner iterable, in order.
-    """
-    # chain.from_iterable replaces the hand-written accumulator, which was
-    # named `all` and therefore shadowed the builtin of the same name.
-    return list(itertools.chain.from_iterable(xss))
-
-def indent(s):
-    """Indent every non-empty line of `s` by two spaces.
-
-    Each line of the result is terminated by a newline.  Empty lines are
-    kept as a bare newline and receive no indentation.
-    """
-    pieces = []
-
-    for line in s.splitlines():
-        if line:
-            pieces.append('  %s\n' % line)
-        else:
-            pieces.append('\n')
-
-    return ''.join(pieces)
-
-def today():
-    """Return the current local date formatted as YYYY-MM-DD."""
-    now = datetime.datetime.now()
-    return now.strftime('%Y-%m-%d')
-
-def warn(*x):
-    """Print a warning to stderr.
-
-    Writes 'warning:', then each positional argument, then a newline.
-    """
-    for s in ('warning:',) + x + ('\n',):
-        # Python 2 print statement: the trailing comma suppresses the
-        # automatic newline, so successive items are separated by spaces.
-        print >>sys.stderr, s,
-
-class ExportFailure(Exception):
-    """Error caught by export_all_to_directory() to report a failed export."""
-
-class PkgData(object):
-    """Tables used by the exporter, loaded from HDF5 stores.
-
-    Constructing an instance reads 'pkg.h5', 'cp.h5' and 'cl.h5' from the
-    current working directory.
-    """
-
-    def __init__(self):
-        # Every store is closed in a `finally` block so that a missing key
-        # or a failed read does not leave the HDF5 file open.
-
-        pkg_store = pd.HDFStore('pkg.h5')
-        try:
-            # Binary packages; read via '_srcpkg', 'Package', 'Version',
-            # 'Homepage' and 'Tag' in export_srcpkgs().
-            self.pkgs = pkg_store['packages']
-            # Package descriptions ('Package', 'Description-en').
-            self.descs = pkg_store['descriptions']
-            # Source packages ('_srcpkg', 'srcfile').
-            self.srcs = pkg_store['sources']
-        finally:
-            pkg_store.close()
-
-        cp_store = pd.HDFStore('cp.h5')
-        try:
-            # Rows passed to srcpkg_extract_licenses() as `filess`.
-            self.cpf = cp_store['cp_files']
-            # One summary row per source package ('_srcpkg',
-            # 'Upstream-Name', ...).
-            self.cps = cp_store['cp_summary']
-            # Rows passed to srcpkg_extract_licenses() as `licenses`.
-            self.licenses = cp_store['licenses']
-        finally:
-            cp_store.close()
-
-        cl_store = pd.HDFStore('cl.h5')
-        try:
-            # Changelog entries ('_srcpkg', 'date', 'author').
-            self.cl = cl_store['cl_versions']
-        finally:
-            cl_store.close()
-
-def nouni(s):
-    """Encode unicode objects as UTF-8; return anything else unchanged."""
-    if isinstance(s, unicode):
-        return s.encode('utf8')
-
-    return s
-
-class Template(object):
-    """A named template call with an ordered list of parameters.
-
-    `values` is a sequence of (parameter name, value) pairs.  str()
-    renders '{{name', one '|parameter=value' line per pair, and a closing
-    '}}' line; unicode names and values are encoded with nouni().
-    """
-
-    def __init__(self, name, values):
-        self.name = name
-        self.values = values
-
-    def __str__(self):
-        rows = []
-
-        for (param, value) in self.values:
-            rows.append('|%s=%s' % (nouni(param), nouni(value)))
-
-        return '{{%s\n%s\n}}' % (nouni(self.name), '\n'.join(rows))
-
-def parse_tags(s):
-    """Split a tag field into a list of tags.
-
-    Newlines are removed first; the result is then split on ', '.
-    """
-    flattened = s.replace('\n', '')
-    return flattened.split(', ')
-
-def extract_languages(tags):
-    """Return the programming languages named by `tags`.
-
-    `tags` is an iterable of 'facet::value' strings.  A language is taken
-    from 'implemented-in::<lang>' and from 'devel::lang:<lang>' tags.
-    Tags without a '::' separator (for example an empty string) are
-    ignored.
-
-    Returns a list without duplicates, in first-seen order.
-    """
-    langs = []
-
-    for tag in tags:
-        # partition() never raises, whereas unpacking tag.split('::')
-        # failed with ValueError on tags that lack the separator.
-        (facet, sep, value) = tag.partition('::')
-
-        if not sep:
-            continue
-
-        if facet == 'implemented-in':
-            lang = value
-        elif facet == 'devel' and value.startswith('lang:'):
-            lang = value.split(':')[1]
-        else:
-            continue
-
-        # Deduplicate while keeping a stable order; the previous
-        # list(set(...)) returned the languages in arbitrary order.
-        if lang not in langs:
-            langs.append(lang)
-
-    return langs
-
-def catechise(s):
-    """Flag certain terms in `s` by wrapping them in '??' markers.
-
-    The terms are matched as whole words, case-insensitively.  'linux' is
-    left alone when it is directly preceded by 'gnu/'.  A newline inside
-    a matched two-word term is replaced by a space in the output.
-    """
-    heresies = ["open source", "debian", "(?<!gnu/)linux", "creative commons"]
-    # The space in a two-word term becomes '.', which (with DOTALL) also
-    # matches a line break between the two words.
-    pattern = r'\b(%s)\b' % '|'.join([h.replace(' ', '.') for h in heresies])
-    # The flags must be passed by keyword: the fourth positional argument
-    # of re.sub() is `count`, so the flags were previously never applied.
-    return re.sub(pattern,
-        lambda m: '??%s??' % m.group(1).replace('\n', ' '),
-        s,
-        flags=re.DOTALL | re.IGNORECASE)
-
-def munge_description(s):
-    """Reflow a multi-paragraph description.
-
-    Paragraphs are separated in the input by a line holding ' .'.  Each
-    paragraph has its leading whitespace and its newlines removed, is
-    wrapped at 65 columns, and the paragraphs are joined by blank lines.
-    """
-    wrapped = []
-
-    for para in s.split('\n .\n'):
-        single_line = para.lstrip().replace('\n', '')
-        wrapped.append(textwrap.fill(single_line, 65))
-
-    return '\n\n'.join(wrapped)
-
-def get_license_map():
-    """Load the alias -> canonical license name table.
-
-    Reads the file 'license_map' from the current working directory.  The
-    file consists of paragraphs separated by blank lines; each paragraph
-    starts with a '[canonical name]' header followed by one alias per
-    line.
-
-    Returns a dict mapping every alias to its canonical name.
-    Raises ValueError if a paragraph does not start with a '[...]' header.
-    """
-    aliases_to_canonical = {}
-
-    # `with` closes the file; the bare file(...).read() left that to the
-    # garbage collector.
-    with open('license_map') as f:
-        contents = f.read()
-
-    for para in contents.split('\n\n'):
-        # Stripping lets stray blank lines (e.g. extra newlines at the end
-        # of the file) be skipped instead of failing the header check.
-        para = para.strip()
-
-        if not para:
-            continue
-
-        match = re.match(r'\[([^\]]+)\]', para)
-
-        if not match:
-            # An explicit exception survives `python -O`, unlike assert.
-            raise ValueError('license_map: bad paragraph: %r' % (para,))
-
-        canonical = match.group(1)
-
-        for alias in para[match.end():].lstrip().splitlines():
-            aliases_to_canonical[alias] = canonical
-
-    return aliases_to_canonical
-
-def srcpkg_extract_licenses(header, filess, licenses, cl_date, cl_uploader):
-    """Yield one 'Project license' template per row of `filess`.
-
-    header      -- copyright header row; currently unused (see XXX below)
-    filess      -- DataFrame; the '_license', 'License' and, when
-                   present, 'Copyright' fields of each row are read
-    licenses    -- DataFrame of named licenses ('_license', 'License')
-    cl_date     -- value for the 'License verified date' field
-    cl_uploader -- name used in the 'License verified by' field
-    """
-    # XXX: generate template from header stanza
-    # XXX: flag CC licenses
-    # XXX: check all License stanzas were included
-    # XXX: exclude licenses for Files: debian/*
-    lmap = get_license_map()
-
-    # License name -> license text.  For a multi-line License field the
-    # first line is dropped and the remainder is used as the text.
-    by_name = {}
-    for (_idx, s) in licenses.iterrows():
-        body = s['License']
-        if '\n' in body:
-            body = body.split('\n', 1)[1]
-        by_name[s['_license']] = body
-
-    for (_ix, files) in filess.iterrows():
-        ldesc = files['_license'].strip().lower()
-        ltext = files['License']
-
-        if '\n' in ltext:
-            # Looks like license text is included directly.
-            ltext = munge_description(ltext)
-            txt = 'License: %s\n\n%s' % (ldesc, ltext)
-        elif ldesc in by_name:
-            # License information is a stub. Try to find the corresponding
-            # text(s).
-            ltext = munge_description(by_name[ldesc])
-            txt = 'License: %s\n\n%s' % (ldesc, ltext)
-        else:
-            # No text available under this exact name: fall back to the
-            # parsed license expression.  `txt` used to be assigned only
-            # when some parsed name was missing from by_name, which left
-            # it unbound (first row) or stale (later rows) otherwise.
-            parsed = license.parse_licenses(ldesc)
-            txt = 'License: %s' % (parsed,)
-
-        # ldesc is already lower-cased above.
-        canon = lmap.get(ldesc, 'Other')
-        # XXX: Should maybe bail if there's no copyright field.
-        # NOTE(review): the lines are joined without a separator, so
-        # several copyright lines run together -- confirm this is wanted.
-        cp = ''.join(
-            u'Copyright %s' % line.lstrip()
-            for line in files.dropna().get('Copyright', '').splitlines())
-        cp = cp.encode('utf8')
-        # nouni() only encodes unicode objects; calling .encode() on a byte
-        # string holding non-ASCII data raises UnicodeDecodeError.
-        txt = nouni(txt)
-
-        yield Template('Project license', [
-            ('License', canon),
-            ('License copyright', cp),
-            ('License verified by', 'Debian: %s' % cl_uploader),
-            ('License verified date', cl_date),
-            ('License note', txt)])
-
-def parse_person(s):
-    """Split 'Name <email>' into a (name, email) tuple.
-
-    Returns (s, '') when `s` does not have that form.
-    """
-    # Raw string: '\s' in a plain literal is an invalid escape sequence.
-    match = re.match(r'([^<]+)\s+<([^>]+)>', s)
-
-    if match:
-        # The greedy name group keeps all but one of the whitespace
-        # characters in front of '<'; strip them off.
-        return (match.group(1).strip(), match.group(2))
-    else:
-        return (s, '')
-
-def extract_people(df):
-    """Yield 'Person' templates for the people named in `df`.
-
-    Only the 'Upstream-Contact' field is used; it yields one template
-    with the role 'contact'.  Missing (NA) fields are ignored.
-    """
-    # XXX: extract contributors, maintainers
-    present = df.dropna()
-
-    if 'Upstream-Contact' not in present:
-        return
-
-    (real_name, address) = parse_person(present['Upstream-Contact'])
-    fields = [
-        ('Real name', real_name),
-        ('Role', 'contact'),
-        ('Email', address)]
-    yield Template('Person', fields)
-
-def extract_resources(cp_header):
-    """Yield a 'Resource' template for each download field in `cp_header`.
-
-    The candidate field names come from `download_keys`.  Fields that are
-    absent or NA are skipped.
-    """
-    present = cp_header.dropna()
-    candidates = re.findall('\S+', download_keys)
-
-    for field in candidates:
-        if field not in present:
-            continue
-
-        yield Template('Resource', [
-            ('Resource kind', 'Download'),
-            ('Resource URL', present[field])])
-
-def export_srcpkgs(data, name, srcpkg_names):
-    """Export a package by reference to its constituent source packages.
-
-    This coordinates all the information that goes into a particular page.
-
-    data         -- object exposing the pkgs, descs, srcs, cps, cpf,
-                    licenses and cl tables (see PkgData)
-    name         -- page name
-    srcpkg_names -- list of source package names that make up the page
-
-    This is a generator.  It yields an 'Entry' template, an 'Import'
-    template, the license templates of each source package, and finally
-    the collected 'Person' and 'Resource' templates.  Nothing is yielded
-    (a warning is printed instead) when `srcpkg_names` is empty or when no
-    binary package belongs to the given source packages.
-    """
-
-    if not srcpkg_names:
-        # pd.concat() rejects an empty list and the 'Source link' field
-        # below needs srcpkg_names[0], so there is nothing to export.
-        warn('no source packages found for %r' % (name,))
-        return
-
-    # Map source package names to binary packages, and also make note
-    # of which versions of those source packages we're looking at.
-    binpkgs = pd.concat([
-        data.pkgs[data.pkgs['_srcpkg'] == srcpkg]
-        for srcpkg in srcpkg_names])
-    versions = {}
-
-    # Space-separated download URLs, one per source package.  The pool
-    # sub-directory is the first character of the source file name, or its
-    # first four characters when the name starts with 'lib'.
-    srcfiles=""
-    for srcpkg in srcpkg_names:
-        srcfile = data.srcs[data.srcs['_srcpkg'] == srcpkg]['srcfile'].values[0]
-        letter = srcfile[0]
-        if srcfile[:3] == 'lib':
-            letter = srcfile[:4]
-        srcfile = 'http://ftp.debian.org/debian/pool/main/%s/%s/%s' % (letter,srcpkg,srcfile)
-        srcfiles = srcfiles + srcfile + " "
-
-    # The Entry template reports the version of the last binary package
-    # iterated.  It is tracked explicitly here instead of relying on the
-    # loop variable leaking out of the loop.
-    last_version = None
-    for (_i, pkg) in binpkgs.iterrows():
-        versions[pkg['_srcpkg']] = pkg['Version']
-        last_version = pkg['Version']
-
-    if len(binpkgs) == 0:
-        warn('no binary packages found for', srcpkg_names)
-        return
-
-    binpkg_names = sorted(binpkgs['Package'], key=len)
-    homepages = list(binpkgs['Homepage'])
-    # XXX: maybe choose the one that appears the most?
-    homepage = homepages[0] if homepages else ''
-    tags = set(concat(
-        [parse_tags(t) for t in binpkgs['Tag'] if not pd.isnull(t)]))
-    langs = [s.title() for s in extract_languages(tags)]
-
-    if name in binpkg_names:
-        descpkg = name
-    else:
-        # Heuristic: choose the package with the shortest name.
-        # We could try to do something smarter, like look for the common
-        # prefix of the descriptions of all the binary packages.
-        descpkg = binpkg_names[0]
-    try:
-        desc = list(data.descs[
-            data.descs['Package'] == descpkg]['Description-en'])[0]
-        (short_desc, full_desc) = desc.split('\n', 1)
-    except Exception:
-        # Best effort: a missing or single-line description gives empty
-        # fields.  Unlike a bare `except:`, this lets KeyboardInterrupt
-        # and SystemExit propagate.
-        full_desc = ''
-        short_desc = ''
-    full_desc = catechise(munge_description(full_desc))
-
-    yield Template('Entry', [
-        ('Name', name.capitalize()),
-        ('Short description', short_desc),
-        ('Full description', full_desc),
-        ('Homepage URL', homepage),
-        ('User level', ''),
-        # XXX get this information from apt-file
-        ('Component programs', ''),
-        ('VCS checkout command', ''),
-        ('Computer languages', ', '.join(langs)),
-        ('Status', ''),
-        ('Is GNU', 'No'),
-        ('Version identifier', last_version),
-        ('Version download', srcfiles),
-        ('Submitted by', 'Debian import'),
-        ('Submitted date', today())])
-
-    yield Template('Import', [
-        ('Source', 'Debian'),
-        ('Source link',
-            'http://packages.debian.org/sid/' + srcpkg_names[0]),
-        ('Source packages',
-             ', '.join('%s %s' % (k, v) for (k, v) in versions.iteritems())),
-        ('Date', today())])
-
-    # People and resources are collected across all source packages and
-    # emitted after the license templates.
-    people = []
-    res = []
-
-    for srcpkg in srcpkg_names:
-        # NOTE(review): .ix[0] and the [0] lookups below depend on how the
-        # stored frames are indexed -- confirm before porting to .iloc.
-        pkg_cps = data.cps[data.cps['_srcpkg'] == srcpkg].ix[0]
-        pkg_cpf = data.cpf[data.cpf['_srcpkg'] == srcpkg]
-        pkg_licenses = data.licenses[data.licenses['_srcpkg'] == srcpkg]
-        people.extend(list(extract_people(pkg_cps)))
-        res.extend(list(extract_resources(pkg_cps)))
-
-        pkg_cl = data.cl[data.cl['_srcpkg'] == srcpkg]
-        cl_date = pkg_cl['date']
-        # avoid rare bad index error:
-        if cl_date.empty:
-            continue
-        cl_date = cl_date[0]
-        cl_uploader = pkg_cl['author'][0]
-        for template in srcpkg_extract_licenses(
-                pkg_cps, pkg_cpf, pkg_licenses, cl_date, cl_uploader):
-            # XXX: eliminate duplicates
-            yield template
-
-    for template in people:
-        # XXX: eliminate duplicates
-        yield template
-
-    for template in res:
-        # XXX: eliminate duplicates
-        yield template
-
-    #yield Template('Software category', [
-    #    ('Resource kind', ''),
-    #    ('Resource URL', '')])
-
-def filename(s):
-    """Turn a page name into a '.wiki' file name.
-
-    Every character other than ASCII letters, digits, '_', '+', '.' and
-    '-' is replaced by '_'.  An empty result trips the assertion.
-    """
-    sanitised = re.sub('[^A-Za-z0-9_+.-]', '_', s)
-    assert sanitised, s
-    return sanitised + '.wiki'
-
-def output(path, xs):
-    """Write str(x) for each item of `xs` to `path`, one item per line.
-
-    An existing file at `path` is overwritten.
-    """
-    with open(path, 'w') as sink:
-        sink.writelines(str(x) + '\n' for x in xs)
-
-def output_multi(path, xs):
-    """Output a bunch of pages to a directory with an index.
-
-    path -- output directory; created if it does not exist
-    xs   -- iterable of (page name, templates) pairs
-
-    Each page is written to a file named by filename(page name).  An
-    'index.json' file maps every file name to its page name and file name.
-    """
-
-    index = {}
-
-    if not os.path.exists(path):
-        os.makedirs(path)
-
-    for (name, templates) in xs:
-        fname = filename(name)
-        fpath = os.path.join(path, fname)
-        # Keyed by file name: two page names that sanitise to the same
-        # file name overwrite each other, both on disk and in the index.
-        index[fname] = {'page': name, 'file': fname}
-        output(fpath, templates)
-
-    fpath = os.path.join(path, 'index.json')
-    # `with` makes sure the index is flushed and closed; the handle from a
-    # bare file(fpath, 'w') was never closed explicitly.
-    with open(fpath, 'w') as f:
-        json.dump(index, f)
-
-def uname_srcpkgs(data, name):
-    """Return the source packages whose 'Upstream-Name' equals `name`.
-
-    The result is a list of '_srcpkg' values taken from data.cps; it is
-    empty when no row matches.
-    """
-    matches = data.cps['Upstream-Name'] == name
-    return list(data.cps[matches]['_srcpkg'])
-
-def export_all(data):
-    """Export all packages.
-
-    Returns a generator of (name, templates) tuples, where `templates` is
-    itself the generator returned by export_srcpkgs().  Empty names are
-    skipped silently; names containing a newline are skipped with a
-    warning.
-    """
-
-    summary = data.cps
-
-    # All upstream names, each of which maps to one or more source
-    # packages.
-    unames = sorted(set(summary['Upstream-Name'].dropna()))
-
-    # Source packages without an upstream name are exported under their
-    # own name.
-    no_uname = sorted(set(
-        summary[summary['Upstream-Name'].isnull()]['_srcpkg']))
-
-    def _pages():
-        for uname in unames:
-            yield (uname, uname_srcpkgs(data, uname))
-
-        for srcpkg in no_uname:
-            yield (srcpkg, [srcpkg])
-
-    for (name, srcpkgs) in _pages():
-        if not name:
-            continue
-
-        if '\n' in name:
-            # Seriously?
-            warn('bad name: %r' % name)
-            continue
-
-        # export_srcpkgs() is a generator, so any exception it raises only
-        # surfaces once the caller iterates over the templates.
-        yield (name, export_srcpkgs(data, name, srcpkgs))
-
-def export_all_to_directory(data, outputdir):
-    """Export every package as a page below `outputdir`.
-
-    The name of each page is printed to stdout as it is processed.  A page
-    whose export raises ExportFailure is reported with a warning and is
-    not written.
-    """
-    def _export():
-        for (name, templates) in export_all(data):
-            print(nouni(name))
-
-            try:
-                # Force errors.
-                templates = list(templates)
-            except ExportFailure as e:
-                warn('export failed: %s: %s' % (
-                    nouni(name).strip(), nouni(e.message).strip()))
-                # Skip the page.  Falling through would hand the exhausted
-                # generator to output_multi(), which would write an empty
-                # page and list it in the index.
-                continue
-
-            yield(name, templates)
-
-    output_multi(outputdir, _export())
-
-def main():
-    """Command-line entry point.
-
-    Without arguments, every package is exported to the 'output'
-    directory.  With one argument, the templates of that upstream name are
-    printed to stdout.  Any other number of arguments raises RuntimeError.
-    """
-    data = PkgData()
-    args = sys.argv[1:]
-
-    if len(args) == 0:
-        export_all_to_directory(data, 'output')
-    elif len(args) == 1:
-        # XXX: assumes argument is an upstream name
-        uname = args[0]
-        srcpkgs = uname_srcpkgs(data, uname)
-        templates = export_srcpkgs(data, uname, srcpkgs)
-
-        for template in templates:
-            print(template)
-    else:
-        # Say what went wrong instead of raising an empty exception.
-        raise RuntimeError(
-            'expected at most one argument (an upstream name), got %d'
-            % len(args))
-
-# Run the exporter only when executed as a script, not when imported.
-if __name__ == '__main__':
-    main()