path: root/export.py


                       
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import datetime
import itertools
import json
import os
import re
import sys
import textwrap

import pandas as pd

import license

# Names of the copyright-header fields that may hold an upstream download
# location.  The string is split on whitespace where it is consumed (see
# extract_resources), so the layout below is purely cosmetic.
download_keys = """
    Origin
    Original-Source
    Source
    Source-Code
    X-Origin
    X-Original-Package
    X-Source
    """

def concat(xss):
    """Flatten one level of nesting.

    Returns a new list holding the items of every iterable in `xss`, in
    the order they are encountered.
    """
    return [item for group in xss for item in group]

def indent(s):
    """Return `s` with every non-empty line prefixed by two spaces.

    Each line of the result ends with a newline; empty lines stay empty.
    """
    pieces = []

    for line in s.splitlines():
        if line:
            pieces.append('  %s\n' % line)
        else:
            pieces.append('\n')

    return ''.join(pieces)

def today():
    """Return the current local date formatted as YYYY-MM-DD."""
    now = datetime.datetime.now()
    return now.strftime('%Y-%m-%d')

def warn(*x):
    """Print a warning built from the given values to standard error.

    The values are printed one after another, preceded by a 'warning:'
    item and followed by a newline item.  This relies on the Python 2
    print statement (chevron redirection plus trailing comma).
    """
    for s in ('warning:',) + x + ('\n',):
        print >>sys.stderr, s,

class ExportFailure(Exception):
    """Signals that a package could not be exported.

    export_all_to_directory catches this and reports it as a warning.
    """

class PkgData(object):
    """In-memory copy of the package tables read from the HDF5 stores.

    Attributes, all filled in by __init__:
        pkgs, descs, srcs  -- 'packages', 'descriptions' and 'sources'
                              from pkg.h5
        cpf, cps, licenses -- 'cp_files', 'cp_summary' and 'licenses'
                              from cp.h5
        cl                 -- 'cl_versions' from cl.h5
    """

    def __init__(self):
        (self.pkgs, self.descs, self.srcs) = self._load(
            'pkg.h5', ['packages', 'descriptions', 'sources'])
        (self.cpf, self.cps, self.licenses) = self._load(
            'cp.h5', ['cp_files', 'cp_summary', 'licenses'])
        (self.cl,) = self._load('cl.h5', ['cl_versions'])

    @staticmethod
    def _load(path, keys):
        """Return the objects stored under `keys` in the HDF5 file `path`.

        The store is closed even when a lookup raises, so a failed load
        does not leave the file open.
        """
        store = pd.HDFStore(path)

        try:
            return [store[key] for key in keys]
        finally:
            store.close()

def nouni(s):
    """Return `s` encoded as UTF-8 if it is a unicode object, else `s`
    itself unchanged."""
    if isinstance(s, unicode):
        return s.encode('utf8')

    return s

class Template(object):
    """A wiki template call: a template name plus (field, value) pairs.

    `values` is kept as given; its order is the order the fields are
    rendered in.
    """

    def __init__(self, name, values):
        self.name = name
        self.values = values

    def __str__(self):
        """Render as '{{name', one '|field=value' line per pair, '}}'."""
        rows = []

        for (field, value) in self.values:
            rows.append('|' + '%s=%s' % (nouni(field), nouni(value)))

        return '{{%s\n%s\n}}' % (nouni(self.name), '\n'.join(rows))

def parse_tags(s):
    """Split a Tag field value into a list of tags.

    Newlines are removed first, then the text is split on ', '.
    """
    flattened = ''.join(s.split('\n'))
    return flattened.split(', ')

def extract_languages(tags):
    """Return the languages named by a collection of 'facet::value' tags.

    Two kinds of tag contribute a language:
        implemented-in::<lang>
        devel::lang:<lang>

    Tags without a '::' separator (including the empty string) are
    ignored instead of aborting the export.  The result is a sorted list
    without duplicates, so the output is the same from run to run.
    """
    langs = set()

    for tag in tags:
        # partition() never raises, unlike tuple-unpacking split('::').
        (facet, sep, value) = tag.partition('::')

        if not sep:
            continue

        if facet == 'implemented-in':
            langs.add(value)
        elif facet == 'devel' and value.startswith('lang:'):
            langs.add(value.split(':')[1])

    return sorted(langs)

def catechise(s):
    """Wrap flagged terms in `s` in '??' markers so they stand out.

    The terms are matched case-insensitively as whole words; a space
    inside a term matches any single character (including a newline, which
    is turned into a space in the marked-up result).  'linux' is left
    alone when it is directly preceded by 'gnu/'.
    """
    heresies = ["open source", "debian", "(?<!gnu/)linux", "creative commons"]
    pattern = r'\b(%s)\b' % '|'.join(h.replace(' ', '.') for h in heresies)
    # The flags must be passed by keyword: the fourth positional argument
    # of re.sub() is `count`, not `flags`.
    return re.sub(
        pattern,
        lambda m: '??%s??' % m.group(1).replace('\n', ' '),
        s,
        flags=re.DOTALL | re.IGNORECASE)

def munge_description(s):
    """Reflow a control-file style description.

    Paragraphs are separated by a line holding ' .'.  Within a paragraph
    leading whitespace is stripped, newlines are removed and the text is
    wrapped to 65 columns.  Paragraphs are returned joined by blank lines.
    """
    wrapped = []

    for para in s.split('\n .\n'):
        joined = para.lstrip().replace('\n', '')
        wrapped.append(textwrap.fill(joined, 65))

    return '\n\n'.join(wrapped)

def get_license_map(path='license_map'):
    """Load the table mapping license aliases to canonical names.

    The file at `path` holds paragraphs separated by blank lines.  Each
    paragraph starts with '[Canonical name]' and is followed by one alias
    per line.  Returns a dict {alias: canonical name}.

    Raises AssertionError (carrying the offending paragraph) when a
    paragraph does not start with a bracketed name.
    """
    # open() + with: file() exists only in Python 2 and the handle was
    # never closed.
    with open(path) as f:
        contents = f.read()

    canonical_by_alias = {}

    for para in contents.split('\n\n'):
        # Extra blank lines leave stray newlines around a paragraph.
        para = para.strip('\n')

        if not para:
            continue

        match = re.match(r'\[([^\]]+)\]', para)
        assert match, para
        canonical = match.group(1)

        for alias in para[match.end():].lstrip().splitlines():
            canonical_by_alias[alias] = canonical

    return canonical_by_alias

def srcpkg_extract_licenses(header, filess, licenses, cl_date, cl_uploader):
    """Yield a 'Project license' Template for each row of `filess`.

    header      -- copyright header stanza; currently unused
    filess      -- DataFrame of Files stanzas; the '_license' and
                   'License' columns are read, plus 'Copyright' if present
    licenses    -- DataFrame of stand-alone License stanzas ('_license'
                   and 'License' columns)
    cl_date     -- value emitted as 'License verified date'
    cl_uploader -- name emitted after 'Debian: ' as 'License verified by'
    """
    # XXX: generate template from header stanza
    # XXX: flag CC licenses
    # XXX: check all License stanzas were included
    # XXX: exclude licenses for Files: debian/*
    lmap = get_license_map()

    # License name -> license text.  When the field has several lines the
    # first one is dropped and the rest is taken as the text.
    by_name = {}

    for (_idx, s) in licenses.iterrows():
        text = s['License']
        by_name[s['_license']] = (
            text.split('\n', 1)[1] if '\n' in text else text)

    for (_ix, files) in filess.iterrows():
        ldesc = files['_license'].strip().lower()
        ltext = files['License']

        if '\n' in ltext:
            # Looks like license text is included directly.
            txt = 'License: %s\n\n%s' % (ldesc, munge_description(ltext))
        elif ldesc in by_name:
            # License information is a stub; use the stand-alone stanza.
            txt = 'License: %s\n\n%s' % (
                ldesc, munge_description(by_name[ldesc]))
        else:
            # Possibly a compound license expression.
            parsed = license.parse_licenses(ldesc)
            lnames = list(parsed.flatten())
            missing = set(lnames) - set(by_name)
            # Always assign txt here: it used to stay unbound (or keep the
            # previous stanza's value) when nothing was missing.
            txt = 'License: %s' % (parsed,)

            if lnames and not missing:
                # Every component is known: append the component texts.
                txt += '\n\n' + '\n\n'.join(
                    munge_description(by_name[lname]) for lname in lnames)

        canon = lmap.get(ldesc, 'Other')
        # XXX: Should maybe bail if there's no copyright field.
        # NOTE(review): the lines are joined without a separator -- confirm
        # that this is what the wiki side expects.
        cp = ''.join(
            u'Copyright %s' % line.lstrip()
            for line in files.dropna().get('Copyright', '').splitlines())
        cp = cp.encode('utf8')
        txt = txt.encode('utf8')

        yield Template('Project license', [
            ('License', canon),
            ('License copyright', cp),
            ('License verified by', 'Debian: %s' % cl_uploader),
            ('License verified date', cl_date),
            ('License note', txt)])

def parse_person(s):
    """Split 'Real Name <address>' into a (name, email) pair.

    The name is returned without surrounding whitespace.  When `s` does
    not have that shape, (s, '') is returned with `s` untouched.
    """
    match = re.match(r'([^<]+)\s+<([^>]+)>', s)

    if match is None:
        return (s, '')

    # The greedy name group swallows all but one of the blanks before '<'.
    return (match.group(1).strip(), match.group(2))

def extract_people(df):
    """Yield a 'Person' Template for the 'Upstream-Contact' entry of `df`.

    Nothing is yielded when that entry is absent or null.
    """
    # XXX: extract contributors, maintainers
    present = df.dropna()

    if 'Upstream-Contact' not in present:
        return

    (name, email) = parse_person(present['Upstream-Contact'])
    fields = [
        ('Real name', name),
        ('Role', 'contact'),
        ('Email', email)]
    yield Template('Person', fields)

def extract_resources(cp_header):
    """Yield a 'Resource' Template of kind 'Download' for every field of
    `cp_header` that is listed in download_keys and is not null."""
    present = cp_header.dropna()
    wanted = re.findall(r'\S+', download_keys)

    for key in wanted:
        if key not in present:
            continue

        fields = [
            ('Resource kind', 'Download'),
            ('Resource URL', present[key])]
        yield Template('Resource', fields)

def export_srcpkgs(data, name, srcpkg_names):
    """Export a package by reference to its constituent source packages.

    This coordinates all the information that goes into a particular page.

    data is a PkgData-style object (its pkgs, srcs, descs, cps, cpf,
    licenses and cl tables are read), name is the page name and
    srcpkg_names the list of source packages that make up the page.

    Yields Template objects in this order: one 'Entry', one 'Import', the
    'Project license' templates of each source package, then every
    'Person' and finally every 'Resource' template.  Yields nothing, after
    a warning, when there are no source packages or no binary packages.
    """
    if not srcpkg_names:
        # pd.concat() rejects an empty list; warn instead of crashing
        # (e.g. for an unknown name given on the command line).
        warn('no source packages found for', name)
        return

    # Map source package names to binary packages, and also make note
    # of which versions of those source packages we're looking at.
    binpkgs = pd.concat([
        data.pkgs[data.pkgs['_srcpkg'] == srcpkg]
        for srcpkg in srcpkg_names])

    # Checked before anything else is looked up, so that a package without
    # binaries is reported as such.
    if len(binpkgs) == 0:
        warn('no binary packages found for', srcpkg_names)
        return

    srcfiles = ""
    for srcpkg in srcpkg_names:
        srcfile = data.srcs[data.srcs['_srcpkg'] == srcpkg]['srcfile'].values[0]
        # The pool directory is the first character of the file name, or
        # the first four when the name starts with 'lib'.
        letter = srcfile[:4] if srcfile[:3] == 'lib' else srcfile[0]
        srcfile = 'http://ftp.debian.org/debian/pool/main/%s/%s/%s' % (
            letter, srcpkg, srcfile)
        srcfiles = srcfiles + srcfile + " "

    versions = {}
    # Version of the last binary package; named explicitly rather than
    # read from the loop variable after the loop.
    last_version = ''

    for (_i, pkg) in binpkgs.iterrows():
        versions[pkg['_srcpkg']] = pkg['Version']
        last_version = pkg['Version']

    binpkg_names = sorted(binpkgs['Package'], key=len)
    # Skip null entries so that a missing homepage is not rendered as 'nan'.
    homepages = [h for h in binpkgs['Homepage'] if not pd.isnull(h)]
    # XXX: maybe choose the one that appears the most?
    homepage = homepages[0] if homepages else ''
    tags = set(concat(
        [parse_tags(t) for t in binpkgs['Tag'] if not pd.isnull(t)]))
    langs = [s.title() for s in extract_languages(tags)]

    if name in binpkg_names:
        descpkg = name
    else:
        # Heuristic: choose the package with the shortest name.
        # We could try to do something smarter, like look for the common
        # prefix of the descriptions of all the binary packages.
        descpkg = binpkg_names[0]

    short_desc = ''
    full_desc = ''

    try:
        desc = list(data.descs[
            data.descs['Package'] == descpkg]['Description-en'])[0]
        # partition() keeps a one-line description as the short
        # description instead of discarding it.
        (short_desc, _sep, full_desc) = desc.partition('\n')
    except (IndexError, KeyError, AttributeError):
        # No description row, no such column, or a null (non-string)
        # description: export the page with empty descriptions.
        pass

    full_desc = catechise(munge_description(full_desc))

    yield Template('Entry', [
        ('Name', name.capitalize()),
        ('Short description', short_desc),
        ('Full description', full_desc),
        ('Homepage URL', homepage),
        ('User level', ''),
        # XXX get this information from apt-file
        ('Component programs', ''),
        ('VCS checkout command', ''),
        ('Computer languages', ', '.join(langs)),
        ('Status', ''),
        ('Is GNU', 'No'),
        ('Version identifier', last_version),
        ('Version download', srcfiles),
        ('Submitted by', 'Debian import'),
        ('Submitted date', today())])

    yield Template('Import', [
        ('Source', 'Debian'),
        ('Source link',
            'http://packages.debian.org/sid/' + srcpkg_names[0]),
        # Sorted so that the page does not change from run to run.
        ('Source packages',
             ', '.join('%s %s' % (k, v) for (k, v) in sorted(versions.items()))),
        ('Date', today())])

    people = []
    res = []

    for srcpkg in srcpkg_names:
        # NOTE(review): .ix[0] and the [0] lookups below depend on how the
        # stored tables are indexed -- confirm before porting to a pandas
        # release without .ix.
        pkg_cps = data.cps[data.cps['_srcpkg'] == srcpkg].ix[0]
        pkg_cpf = data.cpf[data.cpf['_srcpkg'] == srcpkg]
        pkg_licenses = data.licenses[data.licenses['_srcpkg'] == srcpkg]
        people.extend(list(extract_people(pkg_cps)))
        res.extend(list(extract_resources(pkg_cps)))

        pkg_cl = data.cl[data.cl['_srcpkg'] == srcpkg]
        cl_date = pkg_cl['date']
        # avoid rare bad index error:
        if cl_date.empty:
            continue
        cl_date = cl_date[0]
        cl_uploader = pkg_cl['author'][0]
        for template in srcpkg_extract_licenses(
                pkg_cps, pkg_cpf, pkg_licenses, cl_date, cl_uploader):
            # XXX: eliminate duplicates
            yield template

    for template in people:
        # XXX: eliminate duplicates
        yield template

    for template in res:
        # XXX: eliminate duplicates
        yield template

    # XXX: a 'Software category' template is not emitted yet.

def filename(s):
    """Map a page name to a file name ending in '.wiki'.

    Every character other than ASCII letters, digits, '_', '+', '.' and
    '-' becomes '_'.  An empty result trips an assertion.
    """
    unsafe = re.compile('[^A-Za-z0-9_+.-]')
    sanitized = unsafe.sub('_', s)
    assert sanitized, s
    return ''.join([sanitized, '.wiki'])

def output(path, xs):
    """Write str() of every item of `xs` to the file `path`, one item
    per line.  An existing file is overwritten."""
    with open(path, 'w') as out:
        out.writelines(str(item) + '\n' for item in xs)

def output_multi(path, xs):
    """Output a bunch of pages to a directory with an index.

    `xs` yields (page name, templates) pairs.  Each page is written to
    `path`/<filename(page name)>; `path` is created when missing.  Once
    all pages are written, index.json is stored in the same directory,
    mapping each file name to {'page': page name, 'file': file name}.
    """
    if not os.path.exists(path):
        os.makedirs(path)

    index = {}

    for (name, templates) in xs:
        fname = filename(name)
        index[fname] = {'page': name, 'file': fname}
        output(os.path.join(path, fname), templates)

    # open() + with: file() exists only in Python 2 and the handle was
    # never closed explicitly.
    with open(os.path.join(path, 'index.json'), 'w') as f:
        json.dump(index, f)

def uname_srcpkgs(data, name):
    """Return the '_srcpkg' values of the rows of data.cps whose
    'Upstream-Name' equals `name`, as a list."""
    summary = data.cps
    matching = summary[summary['Upstream-Name'] == name]
    return list(matching['_srcpkg'])

def export_all(data):
    """Export all packages.

    Returns a generator of (name, templates) tuples, where templates is
    itself the (not yet started) generator made by export_srcpkgs.

    Upstream names come first, in sorted order, each with the source
    packages that declare it.  Source packages without an upstream name
    follow, also sorted, each exported under its own name.  Empty names
    are skipped silently; names containing a newline are skipped with a
    warning.
    """
    summary = data.cps
    upstream = summary['Upstream-Name']

    named = sorted(set(upstream.dropna()))
    unnamed = sorted(set(summary[upstream.isnull()]['_srcpkg']))

    def _candidates():
        for uname in named:
            yield (uname, uname_srcpkgs(data, uname))

        for srcpkg in unnamed:
            yield (srcpkg, [srcpkg])

    for (name, srcpkgs) in _candidates():
        if not name:
            continue

        if '\n' not in name:
            # Generator; exceptions are delayed until it is consumed.
            yield (name, export_srcpkgs(data, name, srcpkgs))
        else:
            # Seriously?
            warn('bad name: %r' % name)

def export_all_to_directory(data, outputdir):
    """Export every package as a page under `outputdir`.

    The name of each package is printed as it is processed.  A package
    whose export raises ExportFailure is reported with a warning and left
    out of the output entirely.
    """
    def _export():
        for (name, templates) in export_all(data):
            print (nouni(name))

            try:
                # export_srcpkgs is a generator: drain it here so that
                # failures surface before anything is written.
                templates = list(templates)
            except ExportFailure as e:
                reason = nouni(e.args[0]) if e.args else ''
                warn('export failed: %s: %s' % (
                    nouni(name).strip(), str(reason).strip()))
                # Skip it: yielding the exhausted generator would produce
                # an empty page plus an index entry for it.
                continue

            yield (name, templates)

    output_multi(outputdir, _export())

def main():
    """Command-line entry point.

    With no argument every package is exported to the 'output' directory.
    With one argument, taken to be an upstream name, the templates of
    that package are printed to standard output.  Anything else raises
    RuntimeError.
    """
    data = PkgData()
    args = sys.argv[1:]
    argc = len(args)

    if argc > 1:
        raise RuntimeError()

    if argc == 0:
        export_all_to_directory(data, 'output')
        return

    # XXX: assumes argument is an upstream name
    uname = args[0]

    for template in export_srcpkgs(data, uname, uname_srcpkgs(data, uname)):
        sys.stdout.write(str(template) + '\n')

if __name__ == '__main__':
    main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import datetime
import itertools
import json
import os
import re
import sys
import textwrap

import pandas as pd

import license

# Names of the copyright-header fields that may hold an upstream download
# location.  The string is split on whitespace where it is consumed (see
# extract_resources), so the layout below is purely cosmetic.
download_keys = """
    Origin
    Original-Source
    Source
    Source-Code
    X-Origin
    X-Original-Package
    X-Source
    """

def concat(xss):
    """Flatten one level of nesting.

    Returns a new list holding the items of every iterable in `xss`, in
    the order they are encountered.
    """
    return [item for group in xss for item in group]

def indent(s):
    """Return `s` with every non-empty line prefixed by two spaces.

    Each line of the result ends with a newline; empty lines stay empty.
    """
    pieces = []

    for line in s.splitlines():
        if line:
            pieces.append('  %s\n' % line)
        else:
            pieces.append('\n')

    return ''.join(pieces)

def today():
    """Return the current local date formatted as YYYY-MM-DD."""
    now = datetime.datetime.now()
    return now.strftime('%Y-%m-%d')

def warn(*x):
    """Print a warning built from the given values to standard error.

    The values are printed one after another, preceded by a 'warning:'
    item and followed by a newline item.  This relies on the Python 2
    print statement (chevron redirection plus trailing comma).
    """
    for s in ('warning:',) + x + ('\n',):
        print >>sys.stderr, s,

class ExportFailure(Exception):
    """Signals that a package could not be exported.

    export_all_to_directory catches this and reports it as a warning.
    """

class PkgData(object):
    """In-memory copy of the package tables read from the HDF5 stores.

    Attributes, all filled in by __init__:
        pkgs, descs, srcs  -- 'packages', 'descriptions' and 'sources'
                              from pkg.h5
        cpf, cps, licenses -- 'cp_files', 'cp_summary' and 'licenses'
                              from cp.h5
        cl                 -- 'cl_versions' from cl.h5
    """

    def __init__(self):
        (self.pkgs, self.descs, self.srcs) = self._load(
            'pkg.h5', ['packages', 'descriptions', 'sources'])
        (self.cpf, self.cps, self.licenses) = self._load(
            'cp.h5', ['cp_files', 'cp_summary', 'licenses'])
        (self.cl,) = self._load('cl.h5', ['cl_versions'])

    @staticmethod
    def _load(path, keys):
        """Return the objects stored under `keys` in the HDF5 file `path`.

        The store is closed even when a lookup raises, so a failed load
        does not leave the file open.
        """
        store = pd.HDFStore(path)

        try:
            return [store[key] for key in keys]
        finally:
            store.close()

def nouni(s):
    """Return `s` encoded as UTF-8 if it is a unicode object, else `s`
    itself unchanged."""
    if isinstance(s, unicode):
        return s.encode('utf8')

    return s

class Template(object):
    """A wiki template call: a template name plus (field, value) pairs.

    `values` is kept as given; its order is the order the fields are
    rendered in.
    """

    def __init__(self, name, values):
        self.name = name
        self.values = values

    def __str__(self):
        """Render as '{{name', one '|field=value' line per pair, '}}'."""
        rows = []

        for (field, value) in self.values:
            rows.append('|' + '%s=%s' % (nouni(field), nouni(value)))

        return '{{%s\n%s\n}}' % (nouni(self.name), '\n'.join(rows))

def parse_tags(s):
    """Split a Tag field value into a list of tags.

    Newlines are removed first, then the text is split on ', '.
    """
    flattened = ''.join(s.split('\n'))
    return flattened.split(', ')

def extract_languages(tags):
    """Return the languages named by a collection of 'facet::value' tags.

    Two kinds of tag contribute a language:
        implemented-in::<lang>
        devel::lang:<lang>

    Tags without a '::' separator (including the empty string) are
    ignored instead of aborting the export.  The result is a sorted list
    without duplicates, so the output is the same from run to run.
    """
    langs = set()

    for tag in tags:
        # partition() never raises, unlike tuple-unpacking split('::').
        (facet, sep, value) = tag.partition('::')

        if not sep:
            continue

        if facet == 'implemented-in':
            langs.add(value)
        elif facet == 'devel' and value.startswith('lang:'):
            langs.add(value.split(':')[1])

    return sorted(langs)

def catechise(s):
    """Wrap flagged terms in `s` in '??' markers so they stand out.

    The terms are matched case-insensitively as whole words; a space
    inside a term matches any single character (including a newline, which
    is turned into a space in the marked-up result).  'linux' is left
    alone when it is directly preceded by 'gnu/'.
    """
    heresies = ["open source", "debian", "(?<!gnu/)linux", "creative commons"]
    pattern = r'\b(%s)\b' % '|'.join(h.replace(' ', '.') for h in heresies)
    # The flags must be passed by keyword: the fourth positional argument
    # of re.sub() is `count`, not `flags`.
    return re.sub(
        pattern,
        lambda m: '??%s??' % m.group(1).replace('\n', ' '),
        s,
        flags=re.DOTALL | re.IGNORECASE)

def munge_description(s):
    """Reflow a control-file style description.

    Paragraphs are separated by a line holding ' .'.  Within a paragraph
    leading whitespace is stripped, newlines are removed and the text is
    wrapped to 65 columns.  Paragraphs are returned joined by blank lines.
    """
    wrapped = []

    for para in s.split('\n .\n'):
        joined = para.lstrip().replace('\n', '')
        wrapped.append(textwrap.fill(joined, 65))

    return '\n\n'.join(wrapped)

def get_license_map(path='license_map'):
    """Load the table mapping license aliases to canonical names.

    The file at `path` holds paragraphs separated by blank lines.  Each
    paragraph starts with '[Canonical name]' and is followed by one alias
    per line.  Returns a dict {alias: canonical name}.

    Raises AssertionError (carrying the offending paragraph) when a
    paragraph does not start with a bracketed name.
    """
    # open() + with: file() exists only in Python 2 and the handle was
    # never closed.
    with open(path) as f:
        contents = f.read()

    canonical_by_alias = {}

    for para in contents.split('\n\n'):
        # Extra blank lines leave stray newlines around a paragraph.
        para = para.strip('\n')

        if not para:
            continue

        match = re.match(r'\[([^\]]+)\]', para)
        assert match, para
        canonical = match.group(1)

        for alias in para[match.end():].lstrip().splitlines():
            canonical_by_alias[alias] = canonical

    return canonical_by_alias

def srcpkg_extract_licenses(header, filess, licenses, cl_date, cl_uploader):
    """Yield a 'Project license' Template for each row of `filess`.

    header      -- copyright header stanza; currently unused
    filess      -- DataFrame of Files stanzas; the '_license' and
                   'License' columns are read, plus 'Copyright' if present
    licenses    -- DataFrame of stand-alone License stanzas ('_license'
                   and 'License' columns)
    cl_date     -- value emitted as 'License verified date'
    cl_uploader -- name emitted after 'Debian: ' as 'License verified by'
    """
    # XXX: generate template from header stanza
    # XXX: flag CC licenses
    # XXX: check all License stanzas were included
    # XXX: exclude licenses for Files: debian/*
    lmap = get_license_map()

    # License name -> license text.  When the field has several lines the
    # first one is dropped and the rest is taken as the text.
    by_name = {}

    for (_idx, s) in licenses.iterrows():
        text = s['License']
        by_name[s['_license']] = (
            text.split('\n', 1)[1] if '\n' in text else text)

    for (_ix, files) in filess.iterrows():
        ldesc = files['_license'].strip().lower()
        ltext = files['License']

        if '\n' in ltext:
            # Looks like license text is included directly.
            txt = 'License: %s\n\n%s' % (ldesc, munge_description(ltext))
        elif ldesc in by_name:
            # License information is a stub; use the stand-alone stanza.
            txt = 'License: %s\n\n%s' % (
                ldesc, munge_description(by_name[ldesc]))
        else:
            # Possibly a compound license expression.
            parsed = license.parse_licenses(ldesc)
            lnames = list(parsed.flatten())
            missing = set(lnames) - set(by_name)
            # Always assign txt here: it used to stay unbound (or keep the
            # previous stanza's value) when nothing was missing.
            txt = 'License: %s' % (parsed,)

            if lnames and not missing:
                # Every component is known: append the component texts.
                txt += '\n\n' + '\n\n'.join(
                    munge_description(by_name[lname]) for lname in lnames)

        canon = lmap.get(ldesc, 'Other')
        # XXX: Should maybe bail if there's no copyright field.
        # NOTE(review): the lines are joined without a separator -- confirm
        # that this is what the wiki side expects.
        cp = ''.join(
            u'Copyright %s' % line.lstrip()
            for line in files.dropna().get('Copyright', '').splitlines())
        cp = cp.encode('utf8')
        txt = txt.encode('utf8')

        yield Template('Project license', [
            ('License', canon),
            ('License copyright', cp),
            ('License verified by', 'Debian: %s' % cl_uploader),
            ('License verified date', cl_date),
            ('License note', txt)])

def parse_person(s):
    """Split 'Real Name <address>' into a (name, email) pair.

    The name is returned without surrounding whitespace.  When `s` does
    not have that shape, (s, '') is returned with `s` untouched.
    """
    match = re.match(r'([^<]+)\s+<([^>]+)>', s)

    if match is None:
        return (s, '')

    # The greedy name group swallows all but one of the blanks before '<'.
    return (match.group(1).strip(), match.group(2))

def extract_people(df):
    """Yield a 'Person' Template for the 'Upstream-Contact' entry of `df`.

    Nothing is yielded when that entry is absent or null.
    """
    # XXX: extract contributors, maintainers
    present = df.dropna()

    if 'Upstream-Contact' not in present:
        return

    (name, email) = parse_person(present['Upstream-Contact'])
    fields = [
        ('Real name', name),
        ('Role', 'contact'),
        ('Email', email)]
    yield Template('Person', fields)

def extract_resources(cp_header):
    """Yield a 'Resource' Template of kind 'Download' for every field of
    `cp_header` that is listed in download_keys and is not null."""
    present = cp_header.dropna()
    wanted = re.findall(r'\S+', download_keys)

    for key in wanted:
        if key not in present:
            continue

        fields = [
            ('Resource kind', 'Download'),
            ('Resource URL', present[key])]
        yield Template('Resource', fields)

def export_srcpkgs(data, name, srcpkg_names):
    """Export a package by reference to its constituent source packages.

    This coordinates all the information that goes into a particular page.

    data is a PkgData-style object (its pkgs, srcs, descs, cps, cpf,
    licenses and cl tables are read), name is the page name and
    srcpkg_names the list of source packages that make up the page.

    Yields Template objects in this order: one 'Entry', one 'Import', the
    'Project license' templates of each source package, then every
    'Person' and finally every 'Resource' template.  Yields nothing, after
    a warning, when there are no source packages or no binary packages.
    """
    if not srcpkg_names:
        # pd.concat() rejects an empty list; warn instead of crashing
        # (e.g. for an unknown name given on the command line).
        warn('no source packages found for', name)
        return

    # Map source package names to binary packages, and also make note
    # of which versions of those source packages we're looking at.
    binpkgs = pd.concat([
        data.pkgs[data.pkgs['_srcpkg'] == srcpkg]
        for srcpkg in srcpkg_names])

    # Checked before anything else is looked up, so that a package without
    # binaries is reported as such.
    if len(binpkgs) == 0:
        warn('no binary packages found for', srcpkg_names)
        return

    srcfiles = ""
    for srcpkg in srcpkg_names:
        srcfile = data.srcs[data.srcs['_srcpkg'] == srcpkg]['srcfile'].values[0]
        # The pool directory is the first character of the file name, or
        # the first four when the name starts with 'lib'.
        letter = srcfile[:4] if srcfile[:3] == 'lib' else srcfile[0]
        srcfile = 'http://ftp.debian.org/debian/pool/main/%s/%s/%s' % (
            letter, srcpkg, srcfile)
        srcfiles = srcfiles + srcfile + " "

    versions = {}
    # Version of the last binary package; named explicitly rather than
    # read from the loop variable after the loop.
    last_version = ''

    for (_i, pkg) in binpkgs.iterrows():
        versions[pkg['_srcpkg']] = pkg['Version']
        last_version = pkg['Version']

    binpkg_names = sorted(binpkgs['Package'], key=len)
    # Skip null entries so that a missing homepage is not rendered as 'nan'.
    homepages = [h for h in binpkgs['Homepage'] if not pd.isnull(h)]
    # XXX: maybe choose the one that appears the most?
    homepage = homepages[0] if homepages else ''
    tags = set(concat(
        [parse_tags(t) for t in binpkgs['Tag'] if not pd.isnull(t)]))
    langs = [s.title() for s in extract_languages(tags)]

    if name in binpkg_names:
        descpkg = name
    else:
        # Heuristic: choose the package with the shortest name.
        # We could try to do something smarter, like look for the common
        # prefix of the descriptions of all the binary packages.
        descpkg = binpkg_names[0]

    short_desc = ''
    full_desc = ''

    try:
        desc = list(data.descs[
            data.descs['Package'] == descpkg]['Description-en'])[0]
        # partition() keeps a one-line description as the short
        # description instead of discarding it.
        (short_desc, _sep, full_desc) = desc.partition('\n')
    except (IndexError, KeyError, AttributeError):
        # No description row, no such column, or a null (non-string)
        # description: export the page with empty descriptions.
        pass

    full_desc = catechise(munge_description(full_desc))

    yield Template('Entry', [
        ('Name', name.capitalize()),
        ('Short description', short_desc),
        ('Full description', full_desc),
        ('Homepage URL', homepage),
        ('User level', ''),
        # XXX get this information from apt-file
        ('Component programs', ''),
        ('VCS checkout command', ''),
        ('Computer languages', ', '.join(langs)),
        ('Status', ''),
        ('Is GNU', 'No'),
        ('Version identifier', last_version),
        ('Version download', srcfiles),
        ('Submitted by', 'Debian import'),
        ('Submitted date', today())])

    yield Template('Import', [
        ('Source', 'Debian'),
        ('Source link',
            'http://packages.debian.org/sid/' + srcpkg_names[0]),
        # Sorted so that the page does not change from run to run.
        ('Source packages',
             ', '.join('%s %s' % (k, v) for (k, v) in sorted(versions.items()))),
        ('Date', today())])

    people = []
    res = []

    for srcpkg in srcpkg_names:
        # NOTE(review): .ix[0] and the [0] lookups below depend on how the
        # stored tables are indexed -- confirm before porting to a pandas
        # release without .ix.
        pkg_cps = data.cps[data.cps['_srcpkg'] == srcpkg].ix[0]
        pkg_cpf = data.cpf[data.cpf['_srcpkg'] == srcpkg]
        pkg_licenses = data.licenses[data.licenses['_srcpkg'] == srcpkg]
        people.extend(list(extract_people(pkg_cps)))
        res.extend(list(extract_resources(pkg_cps)))

        pkg_cl = data.cl[data.cl['_srcpkg'] == srcpkg]
        cl_date = pkg_cl['date']
        # avoid rare bad index error:
        if cl_date.empty:
            continue
        cl_date = cl_date[0]
        cl_uploader = pkg_cl['author'][0]
        for template in srcpkg_extract_licenses(
                pkg_cps, pkg_cpf, pkg_licenses, cl_date, cl_uploader):
            # XXX: eliminate duplicates
            yield template

    for template in people:
        # XXX: eliminate duplicates
        yield template

    for template in res:
        # XXX: eliminate duplicates
        yield template

    # XXX: a 'Software category' template is not emitted yet.

def filename(s):
    """Map a page name to a file name ending in '.wiki'.

    Every character other than ASCII letters, digits, '_', '+', '.' and
    '-' becomes '_'.  An empty result trips an assertion.
    """
    unsafe = re.compile('[^A-Za-z0-9_+.-]')
    sanitized = unsafe.sub('_', s)
    assert sanitized, s
    return ''.join([sanitized, '.wiki'])

def output(path, xs):
    """Write str() of every item of `xs` to the file `path`, one item
    per line.  An existing file is overwritten."""
    with open(path, 'w') as out:
        out.writelines(str(item) + '\n' for item in xs)

def output_multi(path, xs):
    """Output a bunch of pages to a directory with an index.

    `xs` yields (page name, templates) pairs.  Each page is written to
    `path`/<filename(page name)>; `path` is created when missing.  Once
    all pages are written, index.json is stored in the same directory,
    mapping each file name to {'page': page name, 'file': file name}.
    """
    if not os.path.exists(path):
        os.makedirs(path)

    index = {}

    for (name, templates) in xs:
        fname = filename(name)
        index[fname] = {'page': name, 'file': fname}
        output(os.path.join(path, fname), templates)

    # open() + with: file() exists only in Python 2 and the handle was
    # never closed explicitly.
    with open(os.path.join(path, 'index.json'), 'w') as f:
        json.dump(index, f)

def uname_srcpkgs(data, name):
    """Return the '_srcpkg' values of the rows of data.cps whose
    'Upstream-Name' equals `name`, as a list."""
    summary = data.cps
    matching = summary[summary['Upstream-Name'] == name]
    return list(matching['_srcpkg'])

def export_all(data):
    """Export all packages.

    Returns a generator of (name, templates) tuples, where templates is
    itself the (not yet started) generator made by export_srcpkgs.

    Upstream names come first, in sorted order, each with the source
    packages that declare it.  Source packages without an upstream name
    follow, also sorted, each exported under its own name.  Empty names
    are skipped silently; names containing a newline are skipped with a
    warning.
    """
    summary = data.cps
    upstream = summary['Upstream-Name']

    named = sorted(set(upstream.dropna()))
    unnamed = sorted(set(summary[upstream.isnull()]['_srcpkg']))

    def _candidates():
        for uname in named:
            yield (uname, uname_srcpkgs(data, uname))

        for srcpkg in unnamed:
            yield (srcpkg, [srcpkg])

    for (name, srcpkgs) in _candidates():
        if not name:
            continue

        if '\n' not in name:
            # Generator; exceptions are delayed until it is consumed.
            yield (name, export_srcpkgs(data, name, srcpkgs))
        else:
            # Seriously?
            warn('bad name: %r' % name)

def export_all_to_directory(data, outputdir):
    """Export every package as a page under `outputdir`.

    The name of each package is printed as it is processed.  A package
    whose export raises ExportFailure is reported with a warning and left
    out of the output entirely.
    """
    def _export():
        for (name, templates) in export_all(data):
            print (nouni(name))

            try:
                # export_srcpkgs is a generator: drain it here so that
                # failures surface before anything is written.
                templates = list(templates)
            except ExportFailure as e:
                reason = nouni(e.args[0]) if e.args else ''
                warn('export failed: %s: %s' % (
                    nouni(name).strip(), str(reason).strip()))
                # Skip it: yielding the exhausted generator would produce
                # an empty page plus an index entry for it.
                continue

            yield (name, templates)

    output_multi(outputdir, _export())

def main():
    """Command-line entry point.

    With no argument every package is exported to the 'output' directory.
    With one argument, taken to be an upstream name, the templates of
    that package are printed to standard output.  Anything else raises
    RuntimeError.
    """
    data = PkgData()
    args = sys.argv[1:]
    argc = len(args)

    if argc > 1:
        raise RuntimeError()

    if argc == 0:
        export_all_to_directory(data, 'output')
        return

    # XXX: assumes argument is an upstream name
    uname = args[0]

    for template in export_srcpkgs(data, uname, uname_srcpkgs(data, uname)):
        sys.stdout.write(str(template) + '\n')

if __name__ == '__main__':
    main()