This file is indexed.

/usr/lib/ocaml/netstring/nethtml.mli is in libocamlnet-ocaml-dev 3.7.3-4.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
(* $Id: nethtml.mli 1561 2011-03-07 16:09:25Z gerd $
 * ----------------------------------------------------------------------
 *
 *)

(** Parsing of HTML *)


(** The type [document] represents parsed HTML documents:
 *
 * {ul
 * {- [Element (name, args, subnodes)] is an element node for an element of
 *   type [name] (i.e. written [<name ...>...</name>]) with arguments [args]
 *   and subnodes [subnodes] (the material within the element). The arguments
 *   are simply name/value pairs. Entity references (something like [&xy;])
 *   occuring in the values are {b not} resolved.
 *
 *   Arguments without values (e.g. [<select name="x" multiple>]: here,
 *   [multiple] is such an argument) are represented as [(name,name)], i.e. the
 *   name is also returned as value.
 *
 *   As argument names are case-insensitive, the names are all lowercase.}
 * {- [Data s] is a character data node. Again, entity references are contained
 *   as such and not as what they mean.}
 * }
 *
 * Character encodings: The parser is restricted to ASCII-compatible
 * encodings (see the function {!Netconversion.is_ascii_compatible} for
 * a definition). In order to read other encodings, the text must be
 * first recoded to an ASCII-compatible encoding (example below).
 * Names of elements and attributes must additionally be ASCII-only.
 *)
type document =
    Element of (string  *  (string*string) list  *  document list)
  | Data of string
;;


(** We also need a type that declares how to handle the various tags.
    This is called a "simplified DTD", as it is derived from SGML DTDs,
    but simplified to the extent used in the HTML definition.

    The HTML specification (http://www.w3.org/TR/1999/REC-html401-19991224)
    is the reference for the HTML DTD. You can see there that
    most HTML elements are either an inline element, a block element, or
    both ("flow" element). The grammar of HTML is described in terms of
    these classes. For instance, a [P] tag (paragraph) is a block element and
    contains block elements whereas [B] (bold) is an inline element and
    contains inline elements. From this follows that you cannot put a [P]
    inside a [B]: [<B><P>something</P></B>] is illegal.

    The parser needs this information to resolve such input, i.e. do
    something with bad HTML. As HTML allows tag minimization (many end tags
    can be omitted), the parser can read this as: [<B></B><P>something</P>]
    (and the [</B>] in the input is ignored).
    
    If all start and all end tags are written out, changing the
    simplified_dtd does not make any difference.

    There is no normative text that says how to read bad HTML. Because of
    this, it is - to a large degree - an interpretation of HTML what you put
    into [simplified_dtd]. We provide two versions:
    - [html40_dtd]: tries to be close to the official spec
    - [relaxed_html40_dtd]: tries to be close to what common web browsers
      implement
 *)

(* Now follows the type definition of simplified DTDs. *)

type element_class =         (* What is the class of an element? *)
  [ `Inline
  | `Block
  | `Essential_block
  | `None
  | `Everywhere
  ]
;;
(** Element classes are a property used in the HTML DTD. For our purposes,
 * we define element classes simply as an enumeration:
 * - [`Inline] is the class of inline HTML elements
 * - [`Block] is the class of block HTML elements
 * - [`Essential_block] is a sub-class of [`Block] with the additional
 *   property that every start tag must be explicitly ended
 * - [`None] means that the members of the class are neither block nor
 *   inline elements, but have to be handled specially
 * - [`Everywhere] means that the members of the class can occur everywhere, 
 *   regardless of whether a constraint allows it or not.
 *)


type model_constraint =      (* The constraint the subelements must fulfill *)
  [ `Inline
  | `Block
  | `Flow                                            (* = `Inline or `Block *)
  | `Empty
  | `Any
  | `Special
  | `Elements of string list             (* Enumeration of allowed elements *)
  | `Or of (model_constraint * model_constraint)
  | `Except of (model_constraint * model_constraint)
  | `Sub_exclusions of (string list * model_constraint)
  ]
;;
(** Model constraints define the possible sub elements of an element:
 * - [`Inline]: The sub elements must belong to the class [`Inline]
 * - [`Block]: The sub elements must be members of the classes [`Block] or 
 *   [`Essential_block]
 * - [`Flow]: The sub elements must belong to the classes [`Inline], [`Block],
 *   or [`Essential_block]
 * - [`Empty]: There are no sub elements
 * - [`Any]: Any sub element is allowed
 * - [`Special]: The element has special content (e.g. [<script>]).
 *   Functionally equivalent to [`Empty]
 * - [`Elements l]: Only these enumerated elements may occur as sub elements
 * - [`Or(m1,m2)]: One of the constraints [m1] or [m2] must hold
 * - [`Except(m1,m2)]: The constraint [m1] must hold, and [m2] must not hold
 * - [`Sub_exclusions(l,m)]: The constraint [m] must hold; furthermore, 
 *   the elements enumerated in list [l] are not allowed as direct or
 *   indirect subelements, even if [m] or the model of a subelement would
 *   allow them. The difference to [`Except(m, `Elements l)] is that the
 *   exclusion is inherited to the subelements. The [`Sub_exclusions]
 *   expression must be toplevel, i.e. it must not occur within an [`Or], 
 *   [`Except], or another ['Sub_exclusions] expression.
 *
 * Note that the members of the class [`Everywhere] are allowed everywhere,
 * regardless of whether the model constraint allows them or not.
 *
 * Note that certain aspects are not modeled:
 * - [#PCDATA]: We do not specify where PCDATA is allowed and where not.
 * - Order, Number: We do neither specify in which order the sub elements must
 *   occur nor how often they can occur
 * - Inclusions: DTDs may describe that an element extraordinarily
 *   allows a list of elements in all sub elements. 
 * - Optional tags: Whether start or end tags can be omitted (to some extent,
 *   this can be expressed with [`Essential_block], however)
 *)

type simplified_dtd =
    (string * (element_class * model_constraint)) list;;
(** A [simplified_dtd] is an associative list of tuples
 *  [(element_name, (element_class, constraint))]: For every [element_name]
 *  it is declared that it is a member of [element_class], and that
 *  the sub elements must satisfy [constraint].
 *
 *  It is not allowed to have several entries for the same element.
 *)

val html40_dtd : simplified_dtd
  (** The (transitional) HTML 4.0 DTD, expressed as [simplified_dtd] *)

val relaxed_html40_dtd : simplified_dtd
  (** A relaxed version of the HTML 4.0 DTD that matches better common
   * practice. In particular, this DTD additionally allows that inline
   * elements may span blocks. For example, 
   * {[ <B>text1 <P>text2 ]}
   * is parsed as
   * {[ <B>text1 <P>text2</P></B> ]}
   * and not as
   * {[ <B>text1 </B><P>text2</P> ]}
   * \- the latter is more correct (and parsed by [html40_dtd]), but is not what
   * users expect.
   *
   * Note that this is still not what many browsers implement. For example,
   * Netscape treats most inline tags specially: [<B>] switches bold on,
   * [</B>] switches bold off. For example,
   * {[ <A href='a'>text1<B>text2<A href='b'>text3 ]}
   * is parsed as
   * {[ <A href='a'>text1<B>text2</B></A><B><A href='b'>text3</A></B> ]}
   * \- there is an extra [B] element around the second anchor! (You can
   * see what Netscape parses by loading a page into the "Composer".)
   * IMHO it is questionable to consider inline tags as switches because
   * this is totally outside of the HTML specification, and browsers may
   * differ in that point.
   *
   * Furthermore, several elements are turned into essential blocks:
   * [TABLE], [UL], [OL], and [DL]. David Fox reported a problem with structures
   * like:
   * {[ <TABLE><TR><TD><TABLE><TR><TD>x</TD></TD></TR></TABLE>y</TD></TR></TABLE> ]}
   * i.e. the [TD] of the inner table has two end tags. Without additional
   * help, the second [</TD>] would close the outer table cell. Because of
   * this problem, tables are now essential meaning that it is not allowed
   * to implicitly add a missing [</TABLE>]; every table element has to
   * be explicitly ended. This rule seems to be what many browsers implement.
   *)

val parse_document : ?dtd:simplified_dtd ->            (* default: html40_dtd *)
                     ?return_declarations:bool ->      (* default: false *)
                     ?return_pis:bool ->               (* default: false *)
                     ?return_comments:bool ->          (* default: false *)
                     Lexing.lexbuf ->
                       document list
  (** Parses the HTML document from a [lexbuf] and returns it. 
   * 
   * @param dtd specifies the DTD to use. By default, [html40_dtd] is used which
   *   bases on the transitional HTML 4.0 DTD
   * @param return_declarations if set, the parser returns [<!...>] declarations
   *   as [Element("!",["contents",c],[])] nodes, where [c] is the string inside
   *   [<!] and [>]. - By default, declarations are skipped.
   * @param return_pis if set, the parser returns [<?...>] (or [<?...?>]) processing
   *   instructions as [Element("?",["contents",c],[])] nodes, where [c] is the
   *   string inside [<?] and [>] (or [?>]). - By default, processing instructions
   *   are skipped.
   * @param return_comments if set, the parser returns [<!--] .... [-->] comments
   *   as [Element("--",["contents",c],[])] nodes, where [c] is the string inside
   *   [<!--] and [-->]. - By default, comments are skipped.
   *)

val parse : ?dtd:simplified_dtd ->            (* default: html40_dtd *)
            ?return_declarations:bool ->      (* default: false *)
            ?return_pis:bool ->               (* default: false *)
            ?return_comments:bool ->          (* default: false *)
              Netchannels.in_obj_channel ->
                document list
  (** Parses the HTML document from an object channel and returns it.
   * For example, to parse the HTML string [s]:
   * {[
   * let ch = Netchannels.input_string s in
   * let doc = parse ch
   * ]}
   *
   * Arguments are the same as in [parse_document].
   *)

(** {b Note on XHTML}
 *
 * The parser can read XHTML, as long as the following XML features are not
 * used:
 * - Internal DTD subset, i.e. [<!DOCTYPE html ... [ ... ]>]
 * - External entities
 * - [<!\[CDATA\[]
 * - [<!\[INCLUDE\[]
 * - [<!\[IGNORE\[]
 *
 * The following XML features are ok:
 * - Processing instructions
 * - Empty elements (e.g. [<br/>]) as long as the element is declared as 
 *   [`Empty].
 *)

(** {b Note on Character Encodings}
 *
 * The parser can only read character streams that are encoded in an ASCII-
 * compatible way. For example, it is possible to read a UTF-8-encoded
 * stream, but not a UTF-16-encoded stream. All bytes between 1 and 127
 * are taken as ASCII, and other bytes are ignored (copied from input
 * to output).
 *
 * Non-ASCII-compatible streams must be recoded first. For example, to
 * read a UTF-16-encoded netchannel [ch], use:
 *
 * {[
 * let p = 
 *   new Netconversion.recoding_pipe ~in_enc:`Enc_utf16 ~out_enc:`Enc_utf8 () in
 * let ch' =
 *   new Netchannels.input_filter ch p in
 * let doc =
 *   Nethtml.parse ch' in
 * ch' # close_in();
 * ch # close_in();
 * ]}
 *)


val decode : 
      ?enc:Netconversion.encoding ->           (* default: `Enc_iso88591 *)
      ?subst:(int -> string) ->                (* default: failure *)
      ?entity_base:[ `Html | `Xml | `Empty ] -> 
      ?lookup:(string -> string) ->
      ?dtd:simplified_dtd ->
      document list -> document list
  (** Converts entities [&name;] and [&#num;] into the corresponding 
   * characters. The argument [enc] must indicate the character set of
   * the document (by default ISO-8859-1 for backwards compatibility).
   * If a character cannot be represented in this encoding, the function
   * [subst] is called (input is the Unicode code point, output is the
   * substituted string). By default, the function fails if such a 
   * character is found.
   *
   * The arg [entity_base] selects which entities can be converted
   * (see {!Netencoding.Html.decode}). The function [lookup] is called
   * for all unknown [&name;] entities. By default, this function fails.
   *
   * Declarations, processing instructions, and comments are not
   * decoded. The same also applies to elements declared as [`Special]
   * in the DTD. The [dtd] argument determines the DTD, by default
   * [html40_dtd] is assumed.
   *)

val encode : 
      ?enc:Netconversion.encoding ->           (* default: `Enc_iso88591 *)
      ?prefer_name:bool ->                     (* default: true *)
      ?dtd:simplified_dtd ->
      document list -> document list
  (** Converts problematic characters to their corresponding
   * entities. The argument [enc] must indicate the character set of
   * the document (by default ISO-8859-1 for backwards compatibility).
   * If [prefer_name], the algorithm tries to find the named entities
   * ([&name;]); otherwise only numeric entities ([&#num;]) are generated.
   * Names are preferred by default.
   * 
   * Declarations, processing instructions, and comments are not
   * encoded. The same also applies to elements declared as [`Special]
   * in the DTD. The [dtd] argument determines the DTD, by default
   * [html40_dtd] is assumed.
   *)

val map_list : (string -> string) -> document list -> document list
  (** [map_list f doclst]:
   * Applies [f] to all attribute values and data strings (except
   * the attributes of "?", "!", or "--" nodes). 
   *
   * This can be used to change the text encoding of a parsed document:
   * {[
   * let doc' = map_list String.lowercase doc
   * ]}
   * converts all text data to lowercase characters. 
   *)

type xmap_value =
  | Xmap_attribute of string * string * string
  | Xmap_data of string option * string

val xmap_list : (xmap_value -> string) -> string option ->
                   document list -> document list
  (** [xmap_list f surrounding_element_opt doclst]: Similar to [map_list],
    * the function [f] is applied to all attribute values and data strings.
    * Unlike [map_list], more information is passed to the callback function
    * [f]. This function is called with an [xmap_value] argument:
    * - [Xmap_attribute(ename,aname,aval)]: The function is called for an
    *   attribute value of element [ename]. The attribute is [aname] and
    *   has the value [aval]. The function must return the new value of
    *   the attribute (i.e. [aval']).
    * - [Xmap_data(ename_opt,data)]: The function is called for a data
    *   node surrounded by an element [ename_opt] (which is [None] if the
    *   data node is the outermost node). The string [data] is the value
    *   of the data node. The function must return the new value of the
    *   data node (i.e. [data']).
    *
    * [xmap_list] is invoked with [surrounding_element_opt] which is the
    * name of the surrounding element, or [None] if such an element does 
    * not exist, or is unknown.
   *)


val write : ?dtd:simplified_dtd ->            (* default: html40_dtd *) 
            ?xhtml:bool ->
            Netchannels.out_obj_channel ->
            document list ->
	      unit
  (** Writes the document to the output channel. No additional encoding or
   * decoding happens.
   *
   * Empty elements are written without end tag (see also optional argument
   * [xhtml]); the rest is written unabbreviated.
   *
   * Example: To write the document to a file:
   * {[
   * let f = open_out "filename" in
   * let ch = new Netchannels.output_channel f in
   * write ch doc;
   * ch # close_out()
   * ]}
   *
   * @param dtd The assumed simplified DTD, by default [html40_dtd]
   * @param xhtml makes the output compatible with XHTML 1.0 Strict by
   * closing [`Empty] tags with "/>" ([true] by default).
   *)