#
#   Copyright (C) 2002-2005, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent.txt
#
#   ICU Sentence Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on TR 29 version 4.0.0
#


#
# Character categories as defined in TR 29
#
#   Each variable below is the set of characters whose Unicode
#   Sentence_Break property has the value named on the right-hand side.
#   These sets are the building blocks for every rule in this file.
#
$Sep       = [\p{Sentence_Break = Sep}];
$Format    = [\p{Sentence_Break = Format}];
$Sp        = [\p{Sentence_Break = Sp}];
$Lower     = [\p{Sentence_Break = Lower}];
$Upper     = [\p{Sentence_Break = Upper}];
$OLetter   = [\p{Sentence_Break = OLetter}];
$Numeric   = [\p{Sentence_Break = Numeric}];
$ATerm     = [\p{Sentence_Break = ATerm}];
# Note: the variable is named $Term, but the property value it selects is STerm.
$Term      = [\p{Sentence_Break = STerm}];
$Close     = [\p{Sentence_Break = Close}];

#
# Define extended forms of the character classes,
#   incorporate grapheme cluster + format chars.
#
#   Every ...Ex variable has the same shape: one base character, followed by
#   zero or more $Extend characters, followed by zero or more $Format
#   characters.  Only the ATerm, Numeric, Upper and Term classes get an
#   extended form here.

# $Extend is the set of characters with Grapheme_Extend = TRUE.
$Extend     = [[:Grapheme_Extend = TRUE:]];
$ATermEx    = $ATerm   $Extend* $Format*;
$NumericEx  = $Numeric $Extend* $Format*;
$UpperEx    = $Upper   $Extend* $Format*;
$TermEx     = $Term    $Extend* $Format*;

#
#  $SepSeq keeps together CRLF as a separator.  (CRLF is a grapheme cluster)
#          It matches either a single $Sep character or the two-character
#          sequence CR (U+000D) followed by LF (U+000A).
#
$SepSeq  = $Sep | \u000d\u000a;

# $InteriorChars are those that never trigger a following break:
#   every character that is not in $Term, $ATerm or $Sep.
$InteriorChars = [^$Term $ATerm $Sep];   #Note:  includes Extend and Format chars

## -------------------------------------------------

!!forward;

# Forward rules.  The three "no-break fragment" variables below each match a
# run of interior characters ending in an ATerm that must NOT end the
# sentence; the two "end sequence" variables match the part that does end it.

# Rule 6.  Match an ATerm (.) that does not cause a break because a number immediately follows it.
#          Shape: interior chars, an extended ATerm, then an extended Numeric.
$NumberFollows = $InteriorChars* $ATermEx $NumericEx;


# Rule 7.  $UppersSurround   Match a no-break sentence fragment containing a . surrounded by Uppers
#          Shape: interior chars, extended Upper, extended ATerm, extended Upper.
$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;

# Rule 8   Matches a sentence fragment containing "." that should not cause a sentence break,
#          because a lower case word follows the period.
#          After the ATerm come optional $Close chars, optional $Sp chars, then any
#          run of characters that are not OLetter, Upper, Lower or Sep, and finally
#          the $Lower character itself.
$LowerWordFollows  = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;

# Rules 3, 9, 10, 11
#                       Matches a simple sentence, or the trailing part of a complex sentence,
#                       where a simple sentence contains no interior "."s.
#
#   $TermEndSequence  ends with a terminator ($TermEx or $ATermEx), then optional
#                     $Close chars, optional $Sp chars and an optional $SepSeq.
#   $EndSequence      has no terminator: interior chars followed by an optional $SepSeq.
$TermEndSequence   = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?;
$EndSequence       = $InteriorChars* $SepSeq?;

# Put them all together.
#   Both rules accept any number of no-break fragments followed by one end
#   sequence.  They differ only in which end sequence is used and in the
#   {status} value attached to the rule.
($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $TermEndSequence{0};   # status = UBRK_SENTENCE_TERM
($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $EndSequence{100};     # status = UBRK_SENTENCE_SEP

## -------------------------------------------------

!!reverse;

# Reverse rules.  The patterns in this section are written in the mirror
# image of the forward rules: the character that comes last in the text
# appears first in the pattern (compare $RULE6 with the forward
# $NumberFollows, which has ATerm before Numeric).  For the same reason the
# trailing modifiers appear as "$Format* $Extend*" in front of their base
# character instead of "$Extend* $Format*" after it.

# rule 6
#   Mirror of "ATerm followed by Numeric".

$RULE6 = $Numeric $Format* $Extend* $ATerm;

# rule 7
#   Mirror of "Upper ATerm Upper".

$RULE7 = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper;

# rule 8
#   Mirror of "ATerm Close* Sp* (not OLetter/Upper/Lower/Sep)* Lower":
#   the $Lower comes first, then the run of other characters, then the
#   $Sp chars, then the $Close chars, and finally the $ATerm.

$RULE8 = $Lower ($Format* $Extend* [^$OLetter $Upper $Lower $Sep])* 
             ($Format* $Extend* $Sp)* ($Format* $Extend* $Close)*
             $Format* $Extend* $ATerm;

# rule 9, 10, 11
#   $End matches the tail of a sentence.  It has four alternatives:
#     1. a single $Sep
#     2. LF CR  (CRLF written in mirrored order)
#     3. optional $Sp chars, optional $Close chars, then a $Term or $ATerm,
#        with $Format / $Extend runs allowed in front of each part
#     4. the same as 3, preceded by a $Sep

# $CR $LF
$End = $Sep | \u000a\u000d
       | $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format* 
		 $Extend* ($Term | $ATerm)
	   | $Sep $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format* 
		 $Extend* ($Term | $ATerm);
	
# rule 12
#   Any character that is not a $Sep, $Term or $ATerm.

$RULE12 = [^$Sep $Term $ATerm];

# $Join is any number (including zero) of the no-break patterns above.
$Join = ($RULE6 | $RULE7 | $RULE8 | $RULE12)*;

# A sentence tail on its own.
$End;

# An optional sentence tail, then a joined run, finishing on a $RULE12
# character that is neither $Sp nor $Close.
$End? $Join [$RULE12 - $Sp - $Close];

# forces a break at the beginning of text "$Sp blah blah blah"
# remember that the break iterator takes the longest match
#   The part after "/" is look-ahead context: the match ends on the $Sp, and
#   what follows must be end-of-input or a character that is not $Term,
#   $ATerm, $Sp or $Close.
$NOT_T_A_S_C = [^$Term $ATerm $Sp $Close];
$End? $Join $Sp / [$NOT_T_A_S_C {eof}];

# forces a break at the beginning of text "$Close blah blah blah"
#   Same idea as the $Sp rule above, but the match ends on a $Close and the
#   look-ahead set excludes only $Term, $ATerm and $Close.
$NOT_T_A_C = [^$Term $ATerm $Close];
$End? $Join $Close / [$NOT_T_A_C {eof}];

## -------------------------------------------------

!!safe_reverse;

# Safe reverse rules.  As in the reverse section, the patterns are written
# in mirrored order (the later character in the text comes first).

# rule 4
#   One or more $Extend characters followed by one non-$Extend character.
$Extend+ [^$Extend];

# rule 7
#   An $ATerm, then optional $Format / $Extend chars, then an $Upper.
$Extend* $ATerm $Format* $Extend* $Upper;

# rule 8
#   One or more $Term chars, then optional $Sp groups, then optional $Close
#   groups, then an $ATerm; $Extend chars may precede each of them.
#   NOTE(review): the other sections key rule 8 on $Lower, while this pattern
#   starts with $Term -- confirm that this is intentional.
($Extend* $Term)+ ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* $ATerm;

# rule 11
#   A run of $Sp groups followed by a run of $Close groups, either on its
#   own (first rule) or followed by a $Term or $ATerm (second rule).
($Extend* $Sp $Format*)* ($Extend* $Close $Format*)*;
($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm);

## -------------------------------------------------

!!safe_forward;

# Safe forward rules.  These patterns are written in text order, like the
# forward rules.

# rule 7
#   An $ATerm, optional $Extend then $Format chars, then an $Upper.

$ATerm $Extend* $Format* $Upper;

# rule 8
#   A $Lower followed by any single character.

$Lower .;

# rule 11
#   A run of $Close groups followed by a run of $Sp groups; each group is the
#   base character plus its trailing $Extend and $Format chars.

($Close $Extend* $Format*)* ($Sp $Extend* $Format*)*;