/usr/share/link-grammar/de/4.0.regex

 %***************************************************************************%
 %                                                                           %
 %  Copyright (C) 2005, 2006 Sampo Pyysalo, Sophie Aubin                     %
 %  Copyright (C) 2009, 2012 Linas Vepstas                                   %
 %  See file "LICENSE" for information about commercial use of this system   %
 %                                                                           %
 %***************************************************************************%

% This file contains regular expressions that are used to match
% tokens not found in the dictionary. Each regex is given a name which
% determines the disjuncts assigned when the regex matches; this name
% must be defined in the dictionary along with the appropriate disjuncts.
% Note that the order of the regular expressions matters: matches will
% be attempted in the order in which the regexs appear in this file,
% and only the first match will be used.

% Numbers.
% XXX, we need to add utf8 U+00A0 "no-break space"
% 
% Allows at most two colons in hour-muinute-second HH:MM:SS expressions
% Allows at most two digits between colons
HMS-TIME: /^[0-9][0-9]?(:[0-9][0-9]?(:[0-9][0-9]?)?)?(AM|PM|am|pm)?$/

% e.g. 1950's leading number can be higher, for science fiction.
% Must be four digits, or possible three. Must end in s, 's ’s
DECADE-TIME: /^([1-4][0-9][0-9]|[1-9][0-9])0(s|'s|’s)$/

% Day-of-month names; this regex will match before the one below.
DAY-ORDINALS: /^(1st|2nd|3rd|[4-9]th|1[0-9]th|2(0th|1st|2nd|3rd|[4-9]th)|30th|31st)$/

% Ordinal numbers; everything except 1st through 13th
% is handled by regex.
ORDINALS: /^[1-9][0-9]*(0th|1st|2nd|3rd|[4-9]th)$/

% Allows any number of commas or periods
% Be careful not match the period at the end of a sentence; 
% for example: "It happened in 1942."
NUMBERS: /^[0-9,.]*[0-9]$/
% This parses signed numbers and ranges, e.g. "-5" and "5-10" and "9+/-6.5"
NUMBERS: /^[0-9.,-]*[0-9](\+\/-[0-9.,-]*[0-9])?$/
% Parses simple fractions e.g. "1/60" with no decimal points or anything fancy
FRACTION: /^[0-9]+\/[0-9]+$/
% "10(3)" exponent (used in PubMed)
NUMBERS: /^[0-9.,-]*[0-9][0-9.,-]*\([0-9:.,-]*[0-9][0-9.,-]*\)$/

% Roman numerals
% The first expr has the problem that it matches an empty string.
% Thus, the next four rules have at least one section is non-empty.
% (This is an issue for er in german, which is both a word and a suffix.)
% ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
ROMAN-NUMERAL-WORDS: /^M+(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
ROMAN-NUMERAL-WORDS: /^M*(CM|DC{0,3}|C{1,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|LX{0,3}|X{1,3}|XL)(IX|V?I{0,3}|IV)$/
ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|VI{0,3}|I{1,3}|IV)$/
% The look-ahead below would be a better solution, than the four lines
% above, but it seems that neither gnu-regex nor bsd-regex support
% load-ahead, and so we can't use this.
% ROMAN-NUMERAL-WORDS: /^(?=.+)M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/

% Strings of initials. e.g "Dr. J.G.D. Smith lives on Main St."
INITIALS: /^([A-Z]\.)+$/

% Greek letters with numbers
GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\-?[0-9]+$/
PL-GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)s\-?[0-9]+$/

% Some "safe" derived units. Simple units are in dictionary.
% The idea here is for the regex to match something that is almost
% certainly part of a derived unit, and allow the rest to be
% anything; this way we can capture difficult derived units such
% as "mg/kg/day" and even oddities such as "micrograms/mouse/day"
% without listing them explicitly.
% TODO: add more. 
% Some (real) misses from these: 
% micrograms.kg-1.h-1 microM-1 J/cm2 %/day mN/m cm/yr
% m/s days/week ml/s degrees/sec cm/sec cm/s mm/s N/mm (is that a unit?)
% cuts/minute clicks/s beats/minute x/week W/kg/W %/patient-year
% microIU/ml degrees/s counts/mm2 cells/mm3 tumors/mouse
% mm/sec ml/hr mJ/cm(2) m2/g amol/mm2 animals/group
% h-1 min-1 day-1 cm-1 mg-1 kg-1 mg.m-2.min-1 ms.cm-1 g-1
% sec-1 ms-1 ml.min.-1kg-1 ml.hr-1
% also, both kilometer and kilometers seem to be absent(!)
% remember "mm"!

UNITS: /^([npmk]|nano|pico|milli|micro|kilo)?(g|grams?)\//   % grams/anything
UNITS: /^([fnmp]|femto|nano|micro|pico|mu)?mol(es)?\//       % mol/anything
UNITS: /^[a-zA-Z\/.]+\/((m|micro)?[lLg]|kg|mol|min|day|h)$/  % common endings
% common endings, except in the style "mg.kg-1" instead of "mg/kg".
UNITS: /^[a-zA-Z\/.1-]+\.((m|micro)?[lLg]|kg|mol|min|day|h)(-1|\(-1\))$/

% combinations of numbers and units, e.g. "50-kDa", "1-2h"
% TODO: Clean up and check that these are up-to-date wrt the 
% dictionary-recognized units; this is quite a mess currently.
% TODO: Extend the "number" part of the regex to allow anything
% that the NUMBER regex matches.
% One problem here is a failure to split up the expression ... 
% e.g. "2hr" becomes 2 - ND - hr with the ND link. But 2-hr is treated
% as a single word ('I is a 2-hr wait')
% NUMBER-AND-UNIT: /^[0-9.,-]+(msec|s|min|hour|h|hr|day|week|wk|month|year|yr|kDa|kilodalton|base|kilobase|base-pair|kD|kd|kDa|bp|nt|kb|mm|mg|cm|nm|g|Hz|ms|kg|ml|mL|km|microm|\%)$/
% Comment out above, it screws up handling of unit suffixes, for
% example: "Zangbert stock fell 30% to $2.50 yesterday."


% Plural proper nouns.
% Make sure that apostrophe-s is split out correctly.
PL-CAPITALIZED-WORDS:  /^[[:upper:]].*[^iuoys'’]s$/

% Other proper nouns.
% We demand that these end with an alphanumeric, i.e. explicitly
% reject punctuation. We don't want this regex to "swallow" any trailing
% commas, colons, or periods/question-marks at the end of sentences.
% In addition, this must not swallow words ending in 's 'll etc.
% (... any affix, for that matter ...) and so no embedded apostrophe
CAPITALIZED-WORDS:     /^[[:upper:]][^'’]*[^[:punct:]]$/

% Sequence of punctuation marks. If some mark appears in the affix table
% such as a period, comma, dash or underscore, and there's a sequence of
% these, then treat it as a "fill-in-the-blank" placeholder.
% This matters only for punc. appearing in the affix table, since the
% tokenizer explicitly mangles based on these punctution marks.
% 
% Look for at least four in a row.
UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/
link-grammar-dictionaries-all 5.3.14-1 / usr / share / link-grammar / de / 4.0.regex