#!/usr/bin/python
# -*- coding: utf-8 -*-

"""Unix filter for naive phonological analysis of words in wordlists.

Given a list of words, one word per line, on stdin, phon will dump
onsets, codas, consonants between vowels (middles) and all consonant
groupings (pieces).
"""

__author__ =  'kaleissin'
__version__=  '0.1a2'


import sys, re

# Regex character class of every character treated as a vowel: the plain
# and accented vowel letters listed below, plus a literal '+'.
# NOTE: _pre() also runs a plain substring test against this string, so in
# that test the surrounding '[' and ']' count as members as well.
s = u'[+aeiouyāēīōūȳäëïöüÿàèìòùỳáéíóúý]'
# Compiled form of the class; the analysis functions split words on it to
# obtain the consonant runs between vowels.
s_re = re.compile(s)

def _pre(word):
    word = word.strip().decode('utf-8').lower()
    if not word or word[0] in s:
        return 
    return word

def onsets(word, complex=False):
    """Return the leading consonant cluster of a word

    For the word "strengths" this is the "str"-part.

    Keyword arguments:
    complex -- if True, skip onsets shorter than two characters

    Returns a one-element tuple holding the onset, or None when _pre()
    rejects the word or the onset is too short for complex mode.
    """
    cleaned = _pre(word)
    if cleaned:
        # Everything before the first vowel is the onset
        head = s_re.split(cleaned)[0]
        if not complex or len(head) >= 2:
            return (head,)
    return None

def codas(word, complex=False):
    """Find the coda of a word

    That is, in the word "strengths", show the "ngths"-part.

    Keyword arguments:
    complex -- if True, only look for codas of two or more characters

    Returns a one-element tuple holding the coda, or None when _pre()
    rejects the word, when the word ends in a vowel (no coda), or when
    the coda is too short for complex mode.
    """
    word = _pre(word)
    if not word:
        return None
    # Everything after the last vowel is the coda
    coda = s_re.split(word)[-1]
    # A word ending in a vowel splits into a trailing empty string.
    # Report "no coda" rather than a tuple holding '', which is truthy
    # and would make the caller print a blank line.
    if not coda:
        return None
    if complex and len(coda) < 2:
        return None
    return (coda,)

def middles(word, complex=False):
    """Find the middle consonants of a word

    That is, in the word "disestablishmentarianism", show the "s", "st",
    "bl", "shm", "nt", "r", and "n".

    Keyword arguments:
    complex -- if True, only look for middles of two or more characters

    Returns a list of consonant runs found between vowels (possibly
    empty), or None when _pre() rejects the word or the word has fewer
    than two vowels.
    """
    word = _pre(word)
    if not word:
        return None
    chunks = s_re.split(word)
    # With fewer than two vowels there is nothing "between" vowels
    if len(chunks) < 3:
        return None
    # Adjacent vowels yield empty strings, which a minimum length of 1
    # drops; complex mode raises the minimum to 2.  A single explicit
    # list replaces the former double filtering, and it is a real list
    # regardless of what filter() returns on the running interpreter.
    shortest = 2 if complex else 1
    return [chunk for chunk in chunks[1:-1] if len(chunk) >= shortest]

def pieces(word, complex=False):
    """Find all the consonants of a word

    That is, in the word "disestablishmentarianism", show all of "d",
    "s", "st", "bl", "shm", "nt", "r", "n" and "sm".

    Keyword arguments:
    complex -- if True, only look for pieces of two or more characters

    Returns a list of consonant runs (possibly empty), or None when
    _pre() rejects the word.
    """
    word = _pre(word)
    if not word:
        return None
    # re.split() always returns at least one element, so no emptiness
    # check on the split result is needed.  Empty strings (adjacent or
    # trailing vowels) are dropped by the minimum length of 1; complex
    # mode raises the minimum to 2.
    shortest = 2 if complex else 1
    return [part for part in s_re.split(word) if len(part) >= shortest]

def _help(error=False):
    """Print the usage message and terminate the program.

    Keyword arguments:
    error -- if True, write the message to stderr and exit with
             status 1; otherwise write it to stdout and raise a plain
             SystemExit
    """
    usage = """phon <OPTION> [ -c | -h ]

Where OPTION is one of "onsets", "codas", "middles", "pieces".

    -c: Only onsets|codas|middles|pieces longer than one character
    -h: Show this help-message and exit
    """
    if not error:
        # Explicitly requested help: normal output, normal termination
        sys.stdout.write(usage + '\n')
        raise SystemExit
    # Usage error: complain on stderr and signal failure
    sys.stderr.write(usage + '\n')
    sys.exit(1)

if __name__ == '__main__':
    # Command-line driver: read words from stdin, one per line, and print
    # every piece found by the selected analysis on its own line.

    # Options and the action may come in any order, so collect them in a set
    args = set(sys.argv[1:])

    if not args or len(args) > 2:
        _help(True)

    if u'-h' in args:
        _help()

    only_complex = u'-c' in args
    args.discard(u'-c')

    # Exactly one argument, the action, must be left.  "-c" on its own
    # leaves nothing to pop (formerly a KeyError), and two leftover words
    # are a usage error, so both end with the error exit.
    if len(args) != 1:
        _help(True)

    action = args.pop()

    actions = {'middles': middles,
            'codas': codas,
            'onsets': onsets,
            'pieces': pieces
            }

    # Unknown actions fall back to 'pieces'
    analyse = actions.get(action, pieces)

    for line in sys.stdin:
        # A distinct name keeps the module-level function pieces() intact
        found = analyse(line, only_complex)
        if found:
            for piece in found:
                sys.stdout.write(piece.encode('utf-8') + '\n')
