36
37:- module(sgml,
38 [ load_html/3, 39 load_xml/3, 40 load_sgml/3, 41
42 load_sgml_file/2, 43 load_xml_file/2, 44 load_html_file/2, 45
46 load_structure/3, 47
48 load_dtd/2, 49 load_dtd/3, 50 dtd/2, 51 dtd_property/2, 52
53 new_dtd/2, 54 free_dtd/1, 55 open_dtd/3, 56
57 new_sgml_parser/2, 58 free_sgml_parser/1, 59 set_sgml_parser/2, 60 get_sgml_parser/2, 61 sgml_parse/2, 62
63 sgml_register_catalog_file/2, 64
65 xml_quote_attribute/3, 66 xml_quote_cdata/3, 67 xml_quote_attribute/2, 68 xml_quote_cdata/2, 69 xml_name/1, 70 xml_name/2, 71
72 xsd_number_string/2, 73 xsd_time_string/3, 74
75 xml_basechar/1, 76 xml_ideographic/1, 77 xml_combining_char/1, 78 xml_digit/1, 79 xml_extender/1, 80
81 iri_xml_namespace/2, 82 iri_xml_namespace/3, 83 xml_is_dom/1 84 ]). 85:- autoload(library(error),[instantiation_error/1]). 86:- autoload(library(iostream),[open_any/5,close_any/1]). 87:- autoload(library(lists),[member/2,selectchk/3]). 88:- autoload(library(option),[select_option/3,merge_options/3]). 89
90:- meta_predicate
91 load_structure(+, -, :),
92 load_html(+, -, :),
93 load_xml(+, -, :),
94 load_sgml(+, -, :). 95
96:- predicate_options(load_structure/3, 3,
97 [ charpos(integer),
98 cdata(oneof([atom,string])),
99 defaults(boolean),
100 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])),
101 doctype(atom),
102 dtd(any),
103 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])),
104 entity(atom,atom),
105 keep_prefix(boolean),
106 file(atom),
107 line(integer),
108 offset(integer),
109 number(oneof([token,integer])),
110 qualify_attributes(boolean),
111 shorttag(boolean),
112 case_sensitive_attributes(boolean),
113 case_preserving_attributes(boolean),
114 system_entities(boolean),
115 max_memory(integer),
116 ignore_doctype(boolean),
117 space(oneof([sgml,preserve,default,remove,strict])),
118 xmlns(atom),
119 xmlns(atom,atom),
120 pass_to(sgml_parse/2, 2)
121 ]). 122:- predicate_options(load_html/3, 3,
123 [ pass_to(load_structure/3, 3)
124 ]). 125:- predicate_options(load_xml/3, 3,
126 [ pass_to(load_structure/3, 3)
127 ]). 128:- predicate_options(load_sgml/3, 3,
129 [ pass_to(load_structure/3, 3)
130 ]). 131:- predicate_options(load_dtd/3, 3,
132 [ dialect(oneof([sgml,xml,xmlns])),
133 pass_to(open/4, 4)
134 ]). 135:- predicate_options(sgml_parse/2, 2,
136 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]),
137 callable),
138 cdata(oneof([atom,string])),
139 content_length(integer),
140 document(-any),
141 max_errors(integer),
142 parse(oneof([file,element,content,declaration,input])),
143 source(any),
144 syntax_errors(oneof([quiet,print,style])),
145 xml_no_ns(oneof([error,quiet]))
146 ]). 147:- predicate_options(new_sgml_parser/2, 2,
148 [ dtd(any)
149 ]). 150
151
178
179:- multifile user:file_search_path/2. 180:- dynamic user:file_search_path/2. 181
182user:file_search_path(dtd, library('DTD')).
183
184sgml_register_catalog_file(File, Location) :-
185 prolog_to_os_filename(File, OsFile),
186 '_sgml_register_catalog_file'(OsFile, Location).
187
188:- use_foreign_library(foreign(sgml2pl)). 189
190register_catalog(Base) :-
191 absolute_file_name(dtd(Base),
192 [ extensions([soc]),
193 access(read),
194 file_errors(fail)
195 ],
196 SocFile),
197 sgml_register_catalog_file(SocFile, end).
198
199:- initialization
200 ignore(register_catalog('HTML4')). 201
202
203 206
213
214:- thread_local
215 current_dtd/2. 216:- volatile
217 current_dtd/2. 218:- thread_local
219 registered_cleanup/0. 220:- volatile
221 registered_cleanup/0. 222
223:- multifile
224 dtd_alias/2. 225
226:- create_prolog_flag(html_dialect, html5, [type(atom)]). 227
228dtd_alias(html4, 'HTML4').
229dtd_alias(html5, 'HTML5').
230dtd_alias(html, DTD) :-
231 current_prolog_flag(html_dialect, Dialect),
232 dtd_alias(Dialect, DTD).
233
243
244dtd(Type, DTD) :-
245 current_dtd(Type, DTD),
246 !.
247dtd(Type, DTD) :-
248 new_dtd(Type, DTD),
249 ( dtd_alias(Type, Base)
250 -> true
251 ; Base = Type
252 ),
253 absolute_file_name(dtd(Base),
254 [ extensions([dtd]),
255 access(read)
256 ], DtdFile),
257 load_dtd(DTD, DtdFile),
258 register_cleanup,
259 asserta(current_dtd(Type, DTD)).
260
273
274load_dtd(DTD, DtdFile) :-
275 load_dtd(DTD, DtdFile, []).
276load_dtd(DTD, DtdFile, Options) :-
277 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions),
278 setup_call_cleanup(
279 open_dtd(DTD, DTDOptions, DtdOut),
280 setup_call_cleanup(
281 open(DtdFile, read, DtdIn, OpenOptions),
282 copy_stream_data(DtdIn, DtdOut),
283 close(DtdIn)),
284 close(DtdOut)).
285
290
291destroy_dtds :-
292 ( current_dtd(_Type, DTD),
293 free_dtd(DTD),
294 fail
295 ; true
296 ).
297
301
302register_cleanup :-
303 registered_cleanup,
304 !.
305register_cleanup :-
306 ( thread_self(main)
307 -> at_halt(destroy_dtds)
308 ; current_prolog_flag(threads, true)
309 -> prolog_listen(this_thread_exit, destroy_dtds)
310 ; true
311 ),
312 assert(registered_cleanup).
313
314
315 318
319prop(doctype(_), _).
320prop(elements(_), _).
321prop(entities(_), _).
322prop(notations(_), _).
323prop(entity(E, _), DTD) :-
324 ( nonvar(E)
325 -> true
326 ; '$dtd_property'(DTD, entities(EL)),
327 member(E, EL)
328 ).
329prop(element(E, _, _), DTD) :-
330 ( nonvar(E)
331 -> true
332 ; '$dtd_property'(DTD, elements(EL)),
333 member(E, EL)
334 ).
335prop(attributes(E, _), DTD) :-
336 ( nonvar(E)
337 -> true
338 ; '$dtd_property'(DTD, elements(EL)),
339 member(E, EL)
340 ).
341prop(attribute(E, A, _, _), DTD) :-
342 ( nonvar(E)
343 -> true
344 ; '$dtd_property'(DTD, elements(EL)),
345 member(E, EL)
346 ),
347 ( nonvar(A)
348 -> true
349 ; '$dtd_property'(DTD, attributes(E, AL)),
350 member(A, AL)
351 ).
352prop(notation(N, _), DTD) :-
353 ( nonvar(N)
354 -> true
355 ; '$dtd_property'(DTD, notations(NL)),
356 member(N, NL)
357 ).
358
359dtd_property(DTD, Prop) :-
360 prop(Prop, DTD),
361 '$dtd_property'(DTD, Prop).
362
363
364 367
389
390load_structure(Spec, DOM, Options) :-
391 sgml_open_options(Options, OpenOptions, SGMLOptions),
392 setup_call_cleanup(
393 open_any(Spec, read, In, Close, OpenOptions),
394 load_structure_from_stream(In, DOM, SGMLOptions),
395 close_any(Close)).
396
397sgml_open_options(Options, OpenOptions, SGMLOptions) :-
398 Options = M:Plain,
399 ( select_option(encoding(Encoding), Plain, NoEnc)
400 -> ( sgml_encoding(Encoding)
401 -> merge_options(NoEnc, [type(binary)], OpenOptions),
402 SGMLOptions = Options
403 ; OpenOptions = Plain,
404 SGMLOptions = M:NoEnc
405 )
406 ; merge_options(Plain, [type(binary)], OpenOptions),
407 SGMLOptions = Options
408 ).
409
410sgml_encoding(Enc) :-
411 downcase_atom(Enc, Enc1),
412 sgml_encoding_l(Enc1).
413
414sgml_encoding_l('iso-8859-1').
415sgml_encoding_l('us-ascii').
416sgml_encoding_l('utf-8').
417sgml_encoding_l('utf8').
418sgml_encoding_l('iso_latin_1').
419sgml_encoding_l('ascii').
420
421load_structure_from_stream(In, Term, M:Options) :-
422 ( select_option(dtd(DTD), Options, Options1)
423 -> ExplicitDTD = true
424 ; ExplicitDTD = false,
425 Options1 = Options
426 ),
427 move_front(Options1, dialect(_), Options2), 428 setup_call_cleanup(
429 new_sgml_parser(Parser,
430 [ dtd(DTD)
431 ]),
432 parse(Parser, M:Options2, TermRead, In),
433 free_sgml_parser(Parser)),
434 ( ExplicitDTD == true
435 -> ( DTD = dtd(_, DocType),
436 dtd_property(DTD, doctype(DocType))
437 -> true
438 ; true
439 )
440 ; free_dtd(DTD)
441 ),
442 Term = TermRead.
443
444move_front(Options0, Opt, Options) :-
445 selectchk(Opt, Options0, Options1),
446 !,
447 Options = [Opt|Options1].
448move_front(Options, _, Options).
449
450
451parse(Parser, M:Options, Document, In) :-
452 set_parser_options(Options, Parser, In, Options1),
453 parser_meta_options(Options1, M, Options2),
454 set_input_location(Parser, In),
455 sgml_parse(Parser,
456 [ document(Document),
457 source(In)
458 | Options2
459 ]).
460
461set_parser_options([], _, _, []).
462set_parser_options([H|T], Parser, In, Rest) :-
463 ( set_parser_option(H, Parser, In)
464 -> set_parser_options(T, Parser, In, Rest)
465 ; Rest = [H|R2],
466 set_parser_options(T, Parser, In, R2)
467 ).
468
469set_parser_option(Var, _Parser, _In) :-
470 var(Var),
471 !,
472 instantiation_error(Var).
473set_parser_option(Option, Parser, _) :-
474 def_entity(Option, Parser),
475 !.
476set_parser_option(offset(Offset), _Parser, In) :-
477 !,
478 seek(In, Offset, bof, _).
479set_parser_option(Option, Parser, _In) :-
480 parser_option(Option),
481 !,
482 set_sgml_parser(Parser, Option).
483set_parser_option(Name=Value, Parser, In) :-
484 Option =.. [Name,Value],
485 set_parser_option(Option, Parser, In).
486
487
488parser_option(dialect(_)).
489parser_option(shorttag(_)).
490parser_option(case_sensitive_attributes(_)).
491parser_option(case_preserving_attributes(_)).
492parser_option(system_entities(_)).
493parser_option(max_memory(_)).
494parser_option(ignore_doctype(_)).
495parser_option(file(_)).
496parser_option(line(_)).
497parser_option(space(_)).
498parser_option(number(_)).
499parser_option(defaults(_)).
500parser_option(doctype(_)).
501parser_option(qualify_attributes(_)).
502parser_option(encoding(_)).
503parser_option(keep_prefix(_)).
504
505
506def_entity(entity(Name, Value), Parser) :-
507 get_sgml_parser(Parser, dtd(DTD)),
508 xml_quote_attribute(Value, QValue),
509 setup_call_cleanup(open_dtd(DTD, [], Stream),
510 format(Stream, '<!ENTITY ~w "~w">~n',
511 [Name, QValue]),
512 close(Stream)).
513def_entity(xmlns(URI), Parser) :-
514 set_sgml_parser(Parser, xmlns(URI)).
515def_entity(xmlns(NS, URI), Parser) :-
516 set_sgml_parser(Parser, xmlns(NS, URI)).
517
521
522parser_meta_options([], _, []).
523parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :-
524 !,
525 parser_meta_options(T0, M, T).
526parser_meta_options([H|T0], M, [H|T]) :-
527 parser_meta_options(T0, M, T).
528
529
533
534set_input_location(Parser, _In) :-
535 get_sgml_parser(Parser, file(_)),
536 !.
537set_input_location(Parser, In) :-
538 stream_property(In, file_name(File)),
539 !,
540 set_sgml_parser(Parser, file(File)),
541 stream_property(In, position(Pos)),
542 set_sgml_parser(Parser, position(Pos)).
543set_input_location(_, _).
544
545 548
555
556load_sgml_file(File, Term) :-
557 load_sgml(File, Term, []).
558
565
566load_xml_file(File, Term) :-
567 load_xml(File, Term, []).
568
575
576load_html_file(File, DOM) :-
577 load_html(File, DOM, []).
578
605
606load_html(File, Term, M:Options) :-
607 current_prolog_flag(html_dialect, Dialect),
608 dtd(Dialect, DTD),
609 merge_options(Options,
610 [ dtd(DTD),
611 dialect(Dialect),
612 max_errors(-1),
613 syntax_errors(quiet)
614 ], Options1),
615 load_structure(File, Term, M:Options1).
616
624
625load_xml(Input, DOM, M:Options) :-
626 merge_options(Options,
627 [ dialect(xml)
628 ], Options1),
629 load_structure(Input, DOM, M:Options1).
630
638
639load_sgml(Input, DOM, M:Options) :-
640 merge_options(Options,
641 [ dialect(sgml)
642 ], Options1),
643 load_structure(Input, DOM, M:Options1).
644
645
646
647 650
658
659xml_quote_attribute(In, Quoted) :-
660 xml_quote_attribute(In, Quoted, ascii).
661
662xml_quote_cdata(In, Quoted) :-
663 xml_quote_cdata(In, Quoted, ascii).
664
668
669xml_name(In) :-
670 xml_name(In, ascii).
671
672
673 676
688
689
690 693
698
699xml_is_dom(0) :- !, fail. 700xml_is_dom(List) :-
701 is_list(List),
702 !,
703 xml_is_content_list(List).
704xml_is_dom(Term) :-
705 xml_is_element(Term).
706
707xml_is_content_list([]).
708xml_is_content_list([H|T]) :-
709 xml_is_content(H),
710 xml_is_content_list(T).
711
712xml_is_content(0) :- !, fail.
713xml_is_content(pi(Pi)) :-
714 !,
715 atom(Pi).
716xml_is_content(CDATA) :-
717 atom(CDATA),
718 !.
719xml_is_content(CDATA) :-
720 string(CDATA),
721 !.
722xml_is_content(Term) :-
723 xml_is_element(Term).
724
725xml_is_element(element(Name, Attributes, Content)) :-
726 dom_name(Name),
727 dom_attributes(Attributes),
728 xml_is_content_list(Content).
729
730dom_name(NS:Local) :-
731 atom(NS),
732 atom(Local),
733 !.
734dom_name(Local) :-
735 atom(Local).
736
737dom_attributes(0) :- !, fail.
738dom_attributes([]).
739dom_attributes([H|T]) :-
740 dom_attribute(H),
741 dom_attributes(T).
742
743dom_attribute(Name=Value) :-
744 dom_name(Name),
745 atomic(Value).
746
747
748 751:- multifile
752 prolog:message/3. 753
755
756prolog:message(sgml(Parser, File, Line, Message)) -->
757 { get_sgml_parser(Parser, dialect(Dialect))
758 },
759 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ].
760
761
762 765
766:- multifile
767 prolog:called_by/2. 768
769prolog:called_by(sgml_parse(_, Options), Called) :-
770 findall(Meta, meta_call_term(_, Meta, Options), Called).
771
772meta_call_term(T, G+N, Options) :-
773 T = call(Event, G),
774 pmember(T, Options),
775 call_params(Event, Term),
776 functor(Term, _, N).
777
778pmember(X, List) :- 779 nonvar(List),
780 List = [H|T],
781 ( X = H
782 ; pmember(X, T)
783 ).
784
785call_params(begin, begin(tag,attributes,parser)).
786call_params(end, end(tag,parser)).
787call_params(cdata, cdata(cdata,parser)).
788call_params(pi, pi(cdata,parser)).
789call_params(decl, decl(cdata,parser)).
790call_params(error, error(severity,message,parser)).
791call_params(xmlns, xmlns(namespace,url,parser)).
792call_params(urlns, urlns(url,url,parser)).
793
794 797
798:- multifile
799 sandbox:safe_primitive/1,
800 sandbox:safe_meta_predicate/1. 801
802sandbox:safe_meta_predicate(sgml:load_structure/3).
803sandbox:safe_primitive(sgml:dtd(Dialect, _)) :-
804 dtd_alias(Dialect, _).
805sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)).
806sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)).
807sandbox:safe_primitive(sgml:xml_name(_,_)).
808sandbox:safe_primitive(sgml:xml_basechar(_)).
809sandbox:safe_primitive(sgml:xml_ideographic(_)).
810sandbox:safe_primitive(sgml:xml_combining_char(_)).
811sandbox:safe_primitive(sgml:xml_digit(_)).
812sandbox:safe_primitive(sgml:xml_extender(_)).
813sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)).
814sandbox:safe_primitive(sgml:xsd_number_string(_,_)).
815sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))