# Tokenizer configuration: rule order, regex rules and the word lists they use.
# NOTE(review): the abbreviation and currency lists suggest Swedish text - confirm.
# Version of this configuration file.
version=0.2

# Whitespace-separated names of the rules defined under [META-RULES] and [RULES].
# NOTE(review): presumably rules are tried in the order listed, which is why the
# specific patterns precede the WORD / PUNCTUATION / UNKNOWN catch-alls - confirm
# against the loader.
[RULE-ORDER]
ABBREVIATION-KNOWN NUMBER-ORDINAL
URL URL-WWW URL-DOMAIN E-MAIL
WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-SHORT
NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN

# Rule templates that embed a list section by name between SPLITTER characters,
# e.g. %ORDINALS% for the [ORDINALS] section.
# NOTE(review): presumably the entries of the named section are substituted as
# regex alternatives - confirm against the loader.
[META-RULES]
SPLITTER=%
# Digits, an optional hyphen, then one of the [ORDINALS] endings.
# NOTE(review): (?:\Z|\P{Lu}|\P{Ll}) accepts any single character (every character
# is either non-uppercase or non-lowercase) and the trailing $ then demands end of
# input - confirm that this tail is intended.
NUMBER-ORDINAL = \p{N}+-?(?: %ORDINALS% )(?:\Z|\P{Lu}|\P{Ll})$
# Captures an [ABBREVIATIONS] entry plus its trailing period. It must not be
# preceded by a letter or a period, and must be followed by a non-letter or end of
# input. Leading punctuation is allowed but not captured.
ABBREVIATION-KNOWN = (?:\p{P}*)?(?:\A|[^\p{L}\.])((?:%ABBREVIATIONS%)\.)(?:\Z|\P{L})
# Disabled templates. While they stay commented out, no rule visible in this file
# references [PREFIXES] or [SUFFIXES].
#PREFIX = (?:\A|[^\p{Lu}\.]|[^\p{Ll}\.])(%PREFIXES% )(\p{L}+)
#SUFFIX = ((?:\p{L})+)( %SUFFIXES% )(?:\Z|\P{L})

# Regex rules, one NAME=pattern per line.
[RULES]
# Rule definitions kept in separate files.
# NOTE(review): presumably these supply URL, URL-WWW, URL-DOMAIN, E-MAIL, SMILEY and
# REVERSE-SMILEY, which [RULE-ORDER] names but this file does not define - confirm.
%include url
%include e-mail
%include smiley

# Word carrying both a bracketed prefix and a bracketed suffix, each optionally
# attached with a connector or dash character. Example: (oud)-studente(s)
WORD-PARPREFIX-PARSUFFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

# Word carrying a bracketed prefix, optionally attached with a connector or dash
# character. Examples: (oud)-studente, (on)zin
WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*

# Word carrying a bracketed suffix, optionally attached with a connector or dash
# character. Example: könig(in)
WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

# Keep letter runs joined by dash (\p{Pd}) or connector (\p{Pc}, e.g. underscore)
# punctuation together; at least two runs are required.
# NOTE(review): this pattern matches no brackets itself; parts in parentheses are
# covered by the WORD-PAR* rules.
WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+

# Abbreviations with internal periods: groups of 1-3 letters separated by periods,
# with an optional final period. Anchored at the start, and must be followed by
# end of input or one of , : ; (which stays outside the captured group).
ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)(?:\Z|[,:;])

# Retain initials: exactly one uppercase or titlecase letter followed by a period,
# with nothing before or after it (anchored at both ends).
INITIAL=^(?:\p{Lt}|\p{Lu})\.$

# Runs of two or more of the characters . - ! ? (ellipsis, "--", "?!", ...).
# Note: the run may mix these characters; it does not have to repeat a single one.
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}

# Dates.
# DATE: four digits, a hyphen, 1-2 digits, a period or hyphen, 1-2 digits and an
# optional trailing period, e.g. 2011-12-31
DATE=\p{N}{4}-\p{N}{1,2}[\.-]\p{N}{1,2}\.?
# DATE-SHORT: 1-2 digits, hyphen, 1-2 digits, hyphen, 2-4 digits, e.g. 31-12-2011
# NOTE(review): each hyphen may be followed by an opening bracket (\p{Ps}?), which
# looks unintended - confirm.
DATE-SHORT=\p{N}{1,2}[-]\p{Ps}?\p{N}{1,2}[-]\p{Ps}?\p{N}{2,4}

# Digit groups separated by slashes, e.g. 1/2 or 3/4/5
FRACNUMBER=\p{N}+(?:/\p{N}+)+

# Abbreviated year: an apostrophe-like mark followed by exactly two digits, e.g. '98.
# It must be followed by a non-digit or end of input; only the mark and the digits
# are captured.
NUMBER-YEAR=(['`’‘´]\p{N}{2})(?:\P{N}|\z)

# Times written with periods: hours.minutes with an optional .seconds part, every
# field 1-2 digits, optionally followed by am/pm in any letter case.
# Examples: 9.30, 12.30.45, 9.30pm
# The seconds field accepts two digits like the other fields; with a single-digit
# field "12.30.45" matched only "12.30.4" and left a stray "5".
TIME=\p{N}{1,2}\.\p{N}{1,2}(?:\.\p{N}{1,2})?(?i:am|pm)?

# Retain numbers: an optional leading minus, then one or more digit groups, each
# optionally preceded by a period or comma. Covers plain digits (22), a leading
# period (.22), negative numbers (-5) and grouped/decimal forms (1.000,50).
NUMBER=-?(?:[\.,]?\p{N}+)+

# A single currency symbol (Unicode category Sc).
CURRENCY=\p{Sc}

# A run of letters and non-spacing combining marks.
WORD=[\p{L}\p{Mn}]+

# A single punctuation character.
PUNCTUATION=\p{P}

# Any single character. Listed last in [RULE-ORDER]; presumably the catch-all for
# whatever no other rule matched.
UNKNOWN=.

# No entries. The only template naming this list (PREFIX in [META-RULES]) is
# commented out.
[PREFIXES]

# Suffix patterns: an apostrophe-like mark plus s/S, and colon-attached endings
# (:n, :a, :ar, :arna, :s in lower or upper case).
# NOTE(review): the only template naming this list (SUFFIX in [META-RULES]) is
# commented out, so these entries may currently be unused - confirm.
[SUFFIXES]
['`’‘´][sS]
:[nN]
:[aA]
:ar
:AR
:arna
:ARNA
:[sS]

# Ordinal endings referenced as %ORDINALS% by NUMBER-ORDINAL in [META-RULES]:
# a colon followed by a/A or e/E (presumably forms such as 1:a and 3:e).
[ORDINALS]
:[aA]
:[eE]

# An apostrophe-like mark followed by s or S.
# NOTE(review): no rule visible in this file references this list; how the loader
# uses it is not shown here - confirm.
[TOKENS]
['`’‘´][sS]

# Unit symbols, one per line.
# NOTE(review): no rule visible in this file references this list. "St" is
# capitalised here while [ABBREVIATIONS] has "st" - confirm which is intended.
[UNITS]
km
m
cm
mm
g
kg
hg
cg
C
l
s
sec
min
gb
mb
kb
St


# Currency codes, one per line. This list is separate from the CURRENCY rule in
# [RULES], which matches currency symbols.
# NOTE(review): no rule visible in this file references this list - confirm how
# the loader uses it.
[CURRENCY]
SEK
EUR
DM
USD

# Known abbreviations, one per line, written WITHOUT their final period:
# ABBREVIATION-KNOWN in [META-RULES] appends the trailing \. itself.
# Keep each entry unique.
# NOTE(review): entries end up inside a regex, so an unescaped "." presumably
# matches any character - confirm whether the loader escapes entries.
[ABBREVIATIONS]
adr
aug
bl.a
ca
cg
dec
d.v.s
dvs
dr
dyl
el
enl
etc
&c
ex
febr
f.d
fg.å
f.k
f.n
f.v
forts
fr.o.m
gm
h
ha
hg
hr
i st.f
jfr
kg
kl
km
lr
m.fl
m
min
m.m
mm
mån
ngn
ngt
nr
o.dyl
o.likn
o.s.v
p.g.a
s.a.s
s.k
skn
st
t.ex
t.o.m
tfn
trpt
u.a
vard
v.g.v

# End-of-sentence marker characters, one \uXXXX escape per entry. Each entry is
# preceded by comments giving the character, its Unicode name and its code point
# in decimal (hex).
# NOTE(review): the ASCII full stop (U+002E) is not listed here - presumably it is
# handled elsewhere; confirm.
[EOSMARKERS]
# Character: !
# Name: EXCLAMATION MARK
# Code: 33 (0x21)
\u0021

# Character: ?
# Name: QUESTION MARK
# Code: 3f (0x3f)
\u003F

# Character: ;
# Name: GREEK QUESTION MARK
# Code: 894 (0x37e)
\u037e

# Character: ؟
# Name: ARABIC QUESTION MARK
# Code: 1567 (0x61f)
\u061f

# Character: 。
# Name: IDEOGRAPHIC FULL STOP
# Code: 12290 (0x3002)
\u3002

# Character: ｡
# Name: HALFWIDTH IDEOGRAPHIC FULL STOP
# Code: 65377 (0xff61)
\uff61

# Character: ？
# Name: FULLWIDTH QUESTION MARK
# Code: 65311 (0xff1f)
\uff1f

# Character: ！
# Name: FULLWIDTH EXCLAMATION MARK
# Code: 65281 (0xff01)
\uff01

# Character: ।
# Name: DEVANAGARI DANDA
# Code: 2404 (0x964)
\u0964

# Character: ։
# Name: ARMENIAN FULL STOP
# Code: 1417 (0x589)
\u0589

# Character: ՞
# Name: ARMENIAN QUESTION MARK
# Code: 1374 (0x55e)
\u055e

# Character: ።
# Name: ETHIOPIC FULL STOP
# Code: 4962 (0x1362)
\u1362

# Character: ᙮
# Name: CANADIAN SYLLABICS FULL STOP
# Code: 5742 (0x166e)
\u166e

# Character: ។
# Name: KHMER SIGN KHAN
# Code: 6100 (0x17d4)
\u17d4

# Character: ៕
# Name: KHMER SIGN BARIYOOSAN
# Code: 6101 (0x17d5)
\u17d5

# Character: ᠃
# Name: MONGOLIAN FULL STOP
# Code: 6147 (0x1803)
\u1803

# Character: ᠉
# Name: MONGOLIAN MANCHU FULL STOP
# Code: 6153 (0x1809)
\u1809
