Monday, March 14, 2011

text formatter

#!/usr/bin/env python
# -*- coding: utf8 -*-
#-------------------------------------------------------------------------
#File name : strip.py
#Author : lfchen (Full Name please)
#Created : Mon 14 Mar 2011 02:04:35 PM CST
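#Description : Re-flow hard-wrapped UTF-8 text files: consecutive lines are
#            : joined until a blank line or a line ending in sentence
#            : punctuation; output is written to "formatted_<filename>".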
#Notes :
#-------------------------------------------------------------------------
#Copyright 2011 (c)
#-------------------------------------------------------------------------
import io
import os
import re
import sys

coding = 'utf-8'

# Line endings that close a sentence/paragraph: ASCII punctuation, the CJK
# full stop, and the full-width forms of ! ? : ;
# NOTE(review): the original code tested the ASCII characters twice; the
# second set is presumed to have been these full-width forms before the
# source was transcribed. They are written as escapes so they cannot be
# normalised away again -- confirm this matches the intended behaviour.
_LINE_ENDERS = (
    u".", u"!", u"?", u":", u";",
    u"\u3002", u"\uff01", u"\uff1f", u"\uff1a", u"\uff1b",
)


def _starts_with_capital(text):
    """Return True if *text* begins with an ASCII or full-width capital A-Z."""
    first = text[:1]  # slicing keeps this safe for an empty string
    return u"A" <= first <= u"Z" or u"\uff21" <= first <= u"\uff3a"


def strip(infile, outfile):
    """Re-flow the hard-wrapped text in *infile* and write it to *outfile*.

    Both files are read/written using the module-level ``coding``. For each
    input line:

    * trailing whitespace is removed, except that a single trailing space is
      kept when the line ended with one (so joined words do not run together);
    * a newline is emitted after the line only when it is blank or ends with
      one of ``_LINE_ENDERS``; otherwise the next line is appended directly;
    * a line starting with a capital letter is placed on a new line when the
      previously written text did not already end with a newline.

    Parameters:
        infile:  path of the text file to read.
        outfile: path of the file to create/overwrite with the result.

    Returns:
        None. If *infile* cannot be opened an error message is printed and
        *outfile* is not created.

    Raises:
        IOError/OSError if *outfile* cannot be opened; UnicodeDecodeError if
        *infile* is not valid text in ``coding``.
    """
    try:
        # io.open decodes once at the I/O boundary and works on Python 2 and 3.
        # Its default universal-newline mode also normalises CRLF input.
        source = io.open(infile, "r", encoding=coding)
    except (IOError, OSError):
        print("\tERROR: fail to open file %s for processing" % infile)
        return

    # The with-statement closes both files even if processing raises.
    with source, io.open(outfile, "w", encoding=coding) as target:
        prev_line = u""
        for line in source:
            line = line.rstrip(u"\n")
            if line.endswith(u" "):
                # Keep one trailing space to avoid unintended concatenation
                # of words when this line is joined with the next one.
                line = line.rstrip() + u" "
            else:
                line = line.rstrip()

            if not line.strip() or line.endswith(_LINE_ENDERS):
                line += u"\n"

            # prev_line is empty only before the first line; skipping that
            # case avoids a spurious blank line at the top of the output.
            if (_starts_with_capital(line) and prev_line
                    and not prev_line.endswith(u"\n")):
                line = u"\n" + line

            target.write(line)
            prev_line = line

if __name__ == "__main__":
    # Command-line entry point: format every file named on the command line.
    # Usage is shown for "-h" and also when no file names are given at all.
    if "-h" in sys.argv or len(sys.argv) < 2:
        print("Usage: %s <filename(s)>" % sys.argv[0])
        sys.exit(0)

    for filename in sys.argv[1:]:
        print("dealing with file %s" % filename)
        # Prefix only the base name, so an argument such as "dir/a.txt" is
        # written to "dir/formatted_a.txt" rather than "formatted_dir/a.txt".
        directory, basename = os.path.split(filename)
        strip(filename, os.path.join(directory, "formatted_" + basename))

No comments: