#!python

import sys

from cvutils import Corpora 
from cvutils import Segmenter
from cvutils import Validator
from cvutils import Alphabet 
from cvutils import Phonemiser

from cvutils import wikipedia

# When True, the ``norm`` mode also dumps the rejected sentences and the
# characters found in them that are not part of the locale's alphabet.
debug = False


def _print_usage():
	"""Write the command summary to stderr."""
	print('covo   dump     [pages-articles.xml.bz2]',file=sys.stderr)
	print('       dump-url [locale]',file=sys.stderr)
	print('       segment  [locale]',file=sys.stderr)
	print('       norm     [locale]',file=sys.stderr)
	print('       phon     [locale]',file=sys.stderr)
	print('       opus     [locale]',file=sys.stderr)


def cleanup(skipped, chars, alphabet, count_valid, total):
	"""Report the outcome of a ``norm`` run on stderr.

	:param skipped: sentences that were rejected by the validator
	:param chars: set of characters collected from the rejected sentences
	:param alphabet: set of characters in the locale's alphabet
	:param count_valid: number of sentences that were accepted
	:param total: number of sentences that were read
	"""
	if debug:
		print('',file=sys.stderr)
		print('\n'.join(skipped), file=sys.stderr)
		print('',file=sys.stderr)
		print(chars - alphabet, file=sys.stderr)
	# Empty input has no meaningful ratio; report 0% instead of dividing by zero.
	percent = (count_valid/total)*100.0 if total else 0.0
	print('%d/%d (%.2f%%)' % (count_valid, total, percent),file=sys.stderr)


def _run_dump(dump_name):
	"""Hand a Wikipedia dump to ``wikipedia.process``; ``-`` means stdin."""
	if dump_name == '-':
		dump_name = '/dev/stdin'
	wikipedia.process(dump_name)


def _run_dump_url(locale):
	"""Print the value returned by ``Corpora(locale).dump_url()``."""
	print(Corpora(locale).dump_url())


def _run_segment(locale):
	"""Segment each line of stdin and print one resulting sentence per line."""
	segmenter = Segmenter(locale)
	for line in sys.stdin:
		for sentence in segmenter.segment(line):
			print(sentence)


def _run_normalise(locale):
	"""Normalise stdin line by line, printing the sentences the validator accepts.

	A summary (accepted/total) is written to stderr once, at the end of the
	input or when the run is interrupted with Ctrl-C.
	"""
	validator = Validator(locale)
	alphabet = set(Alphabet(locale).get_alphabet())
	chars = set()
	skipped = []
	count_valid = 0
	total = 0
	try:
		for line in sys.stdin:
			(valid, sent) = validator.normalise(line)
			if valid:
				count_valid += 1
				print(sent)
			else:
				skipped.append(sent)
				chars.update(sent)
			total += 1
	except KeyboardInterrupt:
		# Stop reading, but still report what was processed so far.
		pass
	cleanup(skipped, chars, alphabet, count_valid, total)


def _run_phonemise(locale):
	"""Phonemise every space-separated token of each stdin line."""
	phonemiser = Phonemiser(locale)
	for line in sys.stdin:
		# NOTE(review): the line keeps its trailing newline, so the last token
		# passed to phonemise() includes it -- confirm Phonemiser copes with that.
		print(' '.join([phonemiser.phonemise(w) for w in line.split(' ')]))


def _run_opus(locale):
	"""Print the items yielded by ``Corpora(locale).opus()`` as tab-separated pairs.

	The two elements of each item are printed in swapped order (index 1, then 0).
	"""
	for pair in Corpora(locale).opus():
		print('%s\t%s' % (pair[1], pair[0]))


# Mode name (including aliases) -> handler taking the single positional argument.
_HANDLERS = {
	'dump': _run_dump,
	'dump-url': _run_dump_url,
	'segment': _run_segment,
	'validate': _run_normalise,
	'norm': _run_normalise,
	'phon': _run_phonemise,
	'phonemise': _run_phonemise,
	'opus': _run_opus,
}


def main(argv=None):
	"""Dispatch ``covo <mode> <argument>`` to the matching handler.

	:param argv: full argument vector including the program name;
		defaults to ``sys.argv``
	:raises SystemExit: with status -1 when the mode is missing or unknown,
		or when the mode's argument is missing
	"""
	if argv is None:
		argv = sys.argv

	if len(argv) < 2:
		_print_usage()
		sys.exit(-1)

	mode = argv[1]
	handler = _HANDLERS.get(mode)
	if handler is None:
		print('covo: unknown mode: %s' % mode, file=sys.stderr)
		_print_usage()
		sys.exit(-1)

	# Every mode takes exactly one argument (a dump file or a locale).
	if len(argv) < 3:
		print('covo: missing argument for mode: %s' % mode, file=sys.stderr)
		_print_usage()
		sys.exit(-1)

	handler(argv[2])


if __name__ == '__main__':
	main()

