1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2000-2020, University of Amsterdam 7 VU University Amsterdam 8 CWI, Amsterdam 9 All rights reserved. 10 11 Redistribution and use in source and binary forms, with or without 12 modification, are permitted provided that the following conditions 13 are met: 14 15 1. Redistributions of source code must retain the above copyright 16 notice, this list of conditions and the following disclaimer. 17 18 2. Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in 20 the documentation and/or other materials provided with the 21 distribution. 22 23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 POSSIBILITY OF SUCH DAMAGE. 35*/ 36 37:- module(sgml, 38 [ load_html/3, % +Input, -DOM, +Options 39 load_xml/3, % +Input, -DOM, +Options 40 load_sgml/3, % +Input, -DOM, +Options 41 42 load_sgml_file/2, % +File, -ListOfContent 43 load_xml_file/2, % +File, -ListOfContent 44 load_html_file/2, % +File, -Document 45 46 load_structure/3, % +File, -Term, +Options 47 48 load_dtd/2, % +DTD, +File 49 load_dtd/3, % +DTD, +File, +Options 50 dtd/2, % +Type, -DTD 51 dtd_property/2, % +DTD, ?Property 52 53 new_dtd/2, % +Doctype, -DTD 54 free_dtd/1, % +DTD 55 open_dtd/3, % +DTD, +Options, -Stream 56 57 new_sgml_parser/2, % -Parser, +Options 58 free_sgml_parser/1, % +Parser 59 set_sgml_parser/2, % +Parser, +Options 60 get_sgml_parser/2, % +Parser, +Options 61 sgml_parse/2, % +Parser, +Options 62 63 sgml_register_catalog_file/2, % +File, +StartOrEnd 64 65 xml_quote_attribute/3, % +In, -Quoted, +Encoding 66 xml_quote_cdata/3, % +In, -Quoted, +Encoding 67 xml_quote_attribute/2, % +In, -Quoted 68 xml_quote_cdata/2, % +In, -Quoted 69 xml_name/1, % +In 70 xml_name/2, % +In, +Encoding 71 72 xsd_number_string/2, % ?Number, ?String 73 xsd_time_string/3, % ?Term, ?Type, ?String 74 75 xml_basechar/1, % +Code 76 xml_ideographic/1, % +Code 77 xml_combining_char/1, % +Code 78 xml_digit/1, % +Code 79 xml_extender/1, % +Code 80 81 iri_xml_namespace/2, % +IRI, -Namespace 82 iri_xml_namespace/3, % +IRI, -Namespace, -LocalName 83 xml_is_dom/1 % +Term 84 ]). 85:- autoload(library(error),[instantiation_error/1]). 86:- autoload(library(iostream),[open_any/5,close_any/1]). 87:- autoload(library(lists),[member/2,selectchk/3]). 88:- autoload(library(option),[select_option/3,merge_options/3]). 89 90:- meta_predicate 91 load_structure( , , ), 92 load_html( , , ), 93 load_xml( , , ), 94 load_sgml( , , ). 95 96:- predicate_options(load_structure/3, 3, 97 [ charpos(integer), 98 cdata(oneof([atom,string])), 99 defaults(boolean), 100 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])), 101 doctype(atom), 102 dtd(any), 103 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])), 104 entity(atom,atom), 105 keep_prefix(boolean), 106 file(atom), 107 line(integer), 108 offset(integer), 109 number(oneof([token,integer])), 110 qualify_attributes(boolean), 111 shorttag(boolean), 112 case_sensitive_attributes(boolean), 113 case_preserving_attributes(boolean), 114 system_entities(boolean), 115 max_memory(integer), 116 ignore_doctype(boolean), 117 space(oneof([sgml,preserve,default,remove,strict])), 118 xmlns(atom), 119 xmlns(atom,atom), 120 pass_to(sgml_parse/2, 2) 121 ]). 122:- predicate_options(load_html/3, 3, 123 [ pass_to(load_structure/3, 3) 124 ]). 125:- predicate_options(load_xml/3, 3, 126 [ pass_to(load_structure/3, 3) 127 ]). 128:- predicate_options(load_sgml/3, 3, 129 [ pass_to(load_structure/3, 3) 130 ]). 131:- predicate_options(load_dtd/3, 3, 132 [ dialect(oneof([sgml,xml,xmlns])), 133 pass_to(open/4, 4) 134 ]). 135:- predicate_options(sgml_parse/2, 2, 136 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]), 137 callable), 138 cdata(oneof([atom,string])), 139 content_length(integer), 140 document(-any), 141 max_errors(integer), 142 parse(oneof([file,element,content,declaration,input])), 143 source(any), 144 syntax_errors(oneof([quiet,print,style])), 145 xml_no_ns(oneof([error,quiet])) 146 ]). 147:- predicate_options(new_sgml_parser/2, 2, 148 [ dtd(any) 149 ]).
179:- multifile user:file_search_path/2. 180:- dynamic user:file_search_path/2. 181 182user:file_search_path(dtd, library('DTD')). 183 184sgml_register_catalog_file(File, Location) :- 185 prolog_to_os_filename(File, OsFile), 186 '_sgml_register_catalog_file'(OsFile, Location). 187 188:- use_foreign_library(foreign(sgml2pl)). 189 190register_catalog(Base) :- 191 absolute_file_name(dtd(Base), 192 [ extensions([soc]), 193 access(read), 194 file_errors(fail) 195 ], 196 SocFile), 197 sgml_register_catalog_file(SocFile, end). 198 199:- initialization 200 ignore(register_catalog('HTML4')). 201 202 203 /******************************* 204 * DTD HANDLING * 205 *******************************/ 206 207/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 208Note that concurrent access to DTD objects is not allowed, and hence we 209will allocate and destroy them in each thread. Possibibly it would be 210nicer to find out why concurrent access to DTD's is flawed. It is 211diagnosed to mess with the entity resolution by Fabien Todescato. 212- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ 213 214:- thread_local 215 current_dtd/2. 216:- volatile 217 current_dtd/2. 218:- thread_local 219 registered_cleanup/0. 220:- volatile 221 registered_cleanup/0. 222 223:- multifile 224 dtd_alias/2. 225 226:- create_prolog_flag(html_dialect, html5, [type(atom)]). 227 228dtd_alias(html4, 'HTML4'). 229dtd_alias(html5, 'HTML5'). 230dtd_alias(html, DTD) :- 231 current_prolog_flag(html_dialect, Dialect), 232 dtd_alias(Dialect, DTD).
dtd(Type)
. Loaded DTD
objects are cached. Note that DTD objects may not be shared
between threads. Therefore, dtd/2 maintains the pool of DTD
objects using a thread_local predicate. DTD objects are
destroyed if a thread terminates.
244dtd(Type, DTD) :- 245 current_dtd(Type, DTD), 246 !. 247dtd(Type, DTD) :- 248 new_dtd(Type, DTD), 249 ( dtd_alias(Type, Base) 250 -> true 251 ; Base = Type 252 ), 253 absolute_file_name(dtd(Base), 254 [ extensions([dtd]), 255 access(read) 256 ], DtdFile), 257 load_dtd(DTD, DtdFile), 258 register_cleanup, 259 asserta(current_dtd(Type, DTD)).
274load_dtd(DTD, DtdFile) :- 275 load_dtd(DTD, DtdFile, []). 276load_dtd(DTD, DtdFile, Options) :- 277 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions), 278 setup_call_cleanup( 279 open_dtd(DTD, DTDOptions, DtdOut), 280 setup_call_cleanup( 281 open(DtdFile, read, DtdIn, OpenOptions), 282 copy_stream_data(DtdIn, DtdOut), 283 close(DtdIn)), 284 close(DtdOut)).
291destroy_dtds :-
292 ( current_dtd(_Type, DTD),
293 free_dtd(DTD),
294 fail
295 ; true
296 ).
302register_cleanup :- 303 registered_cleanup, 304 !. 305register_cleanup :- 306 ( thread_self(main) 307 -> at_halt(destroy_dtds) 308 ; current_prolog_flag(threads, true) 309 -> prolog_listen(this_thread_exit, destroy_dtds) 310 ; true 311 ), 312 assert(registered_cleanup). 313 314 315 /******************************* 316 * EXAMINE DTD * 317 *******************************/ 318 319prop(doctype(_), _). 320prop(elements(_), _). 321prop(entities(_), _). 322prop(notations(_), _). 323prop(entity(E, _), DTD) :- 324 ( nonvar(E) 325 -> true 326 ; '$dtd_property'(DTD, entities(EL)), 327 member(E, EL) 328 ). 329prop(element(E, _, _), DTD) :- 330 ( nonvar(E) 331 -> true 332 ; '$dtd_property'(DTD, elements(EL)), 333 member(E, EL) 334 ). 335prop(attributes(E, _), DTD) :- 336 ( nonvar(E) 337 -> true 338 ; '$dtd_property'(DTD, elements(EL)), 339 member(E, EL) 340 ). 341prop(attribute(E, A, _, _), DTD) :- 342 ( nonvar(E) 343 -> true 344 ; '$dtd_property'(DTD, elements(EL)), 345 member(E, EL) 346 ), 347 ( nonvar(A) 348 -> true 349 ; '$dtd_property'(DTD, attributes(E, AL)), 350 member(A, AL) 351 ). 352prop(notation(N, _), DTD) :- 353 ( nonvar(N) 354 -> true 355 ; '$dtd_property'(DTD, notations(NL)), 356 member(N, NL) 357 ). 358 359dtd_property(DTD, Prop) :- 360 prop(Prop, DTD), 361 '$dtd_property'(DTD, Prop). 362 363 364 /******************************* 365 * SGML * 366 *******************************/
A proper XML document contains only a single toplevel element whose name matches the document type. Nevertheless, a list is returned for consistency with the representation of element content.
The encoding(+Encoding)
option is treated special for
compatibility reasons:
iso-8859-1
, us-ascii
or utf-8
,
the stream is opened in binary mode and the option is passed
to the SGML parser.390load_structure(Spec, DOM, Options) :- 391 sgml_open_options(Options, OpenOptions, SGMLOptions), 392 setup_call_cleanup( 393 open_any(Spec, read, In, Close, OpenOptions), 394 load_structure_from_stream(In, DOM, SGMLOptions), 395 close_any(Close)). 396 397sgml_open_options(Options, OpenOptions, SGMLOptions) :- 398 Options = M:Plain, 399 ( select_option(encoding(Encoding), Plain, NoEnc) 400 -> ( sgml_encoding(Encoding) 401 -> merge_options(NoEnc, [type(binary)], OpenOptions), 402 SGMLOptions = Options 403 ; OpenOptions = Plain, 404 SGMLOptions = M:NoEnc 405 ) 406 ; merge_options(Plain, [type(binary)], OpenOptions), 407 SGMLOptions = Options 408 ). 409 410sgml_encoding(Enc) :- 411 downcase_atom(Enc, Enc1), 412 sgml_encoding_l(Enc1). 413 414sgml_encoding_l('iso-8859-1'). 415sgml_encoding_l('us-ascii'). 416sgml_encoding_l('utf-8'). 417sgml_encoding_l('utf8'). 418sgml_encoding_l('iso_latin_1'). 419sgml_encoding_l('ascii'). 420 421load_structure_from_stream(In, Term, M:Options) :- 422 ( select_option(dtd(DTD), Options, Options1) 423 -> ExplicitDTD = true 424 ; ExplicitDTD = false, 425 Options1 = Options 426 ), 427 move_front(Options1, dialect(_), Options2), % dialect sets defaults 428 setup_call_cleanup( 429 new_sgml_parser(Parser, 430 [ dtd(DTD) 431 ]), 432 parse(Parser, M:Options2, TermRead, In), 433 free_sgml_parser(Parser)), 434 ( ExplicitDTD == true 435 -> ( DTD = dtd(_, DocType), 436 dtd_property(DTD, doctype(DocType)) 437 -> true 438 ; true 439 ) 440 ; free_dtd(DTD) 441 ), 442 Term = TermRead. 443 444move_front(Options0, Opt, Options) :- 445 selectchk(Opt, Options0, Options1), 446 !, 447 Options = [Opt|Options1]. 448move_front(Options, _, Options). 449 450 451parse(Parser, M:Options, Document, In) :- 452 set_parser_options(Options, Parser, In, Options1), 453 parser_meta_options(Options1, M, Options2), 454 set_input_location(Parser, In), 455 sgml_parse(Parser, 456 [ document(Document), 457 source(In) 458 | Options2 459 ]). 460 461set_parser_options([], _, _, []). 462set_parser_options([H|T], Parser, In, Rest) :- 463 ( set_parser_option(H, Parser, In) 464 -> set_parser_options(T, Parser, In, Rest) 465 ; Rest = [H|R2], 466 set_parser_options(T, Parser, In, R2) 467 ). 468 469set_parser_option(Var, _Parser, _In) :- 470 var(Var), 471 !, 472 instantiation_error(Var). 473set_parser_option(Option, Parser, _) :- 474 def_entity(Option, Parser), 475 !. 476set_parser_option(offset(Offset), _Parser, In) :- 477 !, 478 seek(In, Offset, bof, _). 479set_parser_option(Option, Parser, _In) :- 480 parser_option(Option), 481 !, 482 set_sgml_parser(Parser, Option). 483set_parser_option(Name=Value, Parser, In) :- 484 Option =.. [Name,Value], 485 set_parser_option(Option, Parser, In). 486 487 488parser_option(dialect(_)). 489parser_option(shorttag(_)). 490parser_option(case_sensitive_attributes(_)). 491parser_option(case_preserving_attributes(_)). 492parser_option(system_entities(_)). 493parser_option(max_memory(_)). 494parser_option(ignore_doctype(_)). 495parser_option(file(_)). 496parser_option(line(_)). 497parser_option(space(_)). 498parser_option(number(_)). 499parser_option(defaults(_)). 500parser_option(doctype(_)). 501parser_option(qualify_attributes(_)). 502parser_option(encoding(_)). 503parser_option(keep_prefix(_)). 504 505 506def_entity(entity(Name, Value), Parser) :- 507 get_sgml_parser(Parser, dtd(DTD)), 508 xml_quote_attribute(Value, QValue), 509 setup_call_cleanup(open_dtd(DTD, [], Stream), 510 format(Stream, '<!ENTITY ~w "~w">~n', 511 [Name, QValue]), 512 close(Stream)). 513def_entity(xmlns(URI), Parser) :- 514 set_sgml_parser(Parser, xmlns(URI)). 515def_entity(xmlns(NS, URI), Parser) :- 516 set_sgml_parser(Parser, xmlns(NS, URI)).
522parser_meta_options([], _, []). 523parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :- 524 !, 525 parser_meta_options(T0, M, T). 526parser_meta_options([H|T0], M, [H|T]) :- 527 parser_meta_options(T0, M, T).
534set_input_location(Parser, _In) :- 535 get_sgml_parser(Parser, file(_)), 536 !. 537set_input_location(Parser, In) :- 538 stream_property(In, file_name(File)), 539 !, 540 set_sgml_parser(Parser, file(File)), 541 stream_property(In, position(Pos)), 542 set_sgml_parser(Parser, position(Pos)). 543set_input_location(_, _). 544 545 /******************************* 546 * UTILITIES * 547 *******************************/
556load_sgml_file(File, Term) :-
557 load_sgml(File, Term, []).
566load_xml_file(File, Term) :-
567 load_xml(File, Term, []).
576load_html_file(File, DOM) :-
577 load_html(File, DOM, []).
dtd(html, DTD)
.html_dialect
You may also want to use the library(http/http_open) to support loading from HTTP and HTTPS URLs. For example:
:- use_module(library(http/http_open)). :- use_module(library(sgml)). load_html_url(URL, DOM) :- load_html(URL, DOM, []).
606load_html(File, Term, M:Options) :-
607 current_prolog_flag(html_dialect, Dialect),
608 dtd(Dialect, DTD),
609 merge_options(Options,
610 [ dtd(DTD),
611 dialect(Dialect),
612 max_errors(-1),
613 syntax_errors(quiet)
614 ], Options1),
615 load_structure(File, Term, M:Options1).
dialect(xml)
625load_xml(Input, DOM, M:Options) :-
626 merge_options(Options,
627 [ dialect(xml)
628 ], Options1),
629 load_structure(Input, DOM, M:Options1).
dialect(sgml)
639load_sgml(Input, DOM, M:Options) :- 640 merge_options(Options, 641 [ dialect(sgml) 642 ], Options1), 643 load_structure(Input, DOM, M:Options1). 644 645 646 647 /******************************* 648 * ENCODING * 649 *******************************/
659xml_quote_attribute(In, Quoted) :- 660 xml_quote_attribute(In, Quoted, ascii). 661 662xml_quote_cdata(In, Quoted) :- 663 xml_quote_cdata(In, Quoted, ascii).
669xml_name(In) :- 670 xml_name(In, ascii). 671 672 673 /******************************* 674 * XML CHARACTER CLASSES * 675 *******************************/
690 /******************************* 691 * TYPE CHECKING * 692 *******************************/
699xml_is_dom(0) :- !, fail. % catch variables 700xml_is_dom(List) :- 701 is_list(List), 702 !, 703 xml_is_content_list(List). 704xml_is_dom(Term) :- 705 xml_is_element(Term). 706 707xml_is_content_list([]). 708xml_is_content_list([H|T]) :- 709 xml_is_content(H), 710 xml_is_content_list(T). 711 712xml_is_content(0) :- !, fail. 713xml_is_content(pi(Pi)) :- 714 !, 715 atom(Pi). 716xml_is_content(CDATA) :- 717 atom(CDATA), 718 !. 719xml_is_content(CDATA) :- 720 string(CDATA), 721 !. 722xml_is_content(Term) :- 723 xml_is_element(Term). 724 725xml_is_element(element(Name, Attributes, Content)) :- 726 dom_name(Name), 727 dom_attributes(Attributes), 728 xml_is_content_list(Content). 729 730dom_name(NS:Local) :- 731 atom(NS), 732 atom(Local), 733 !. 734dom_name(Local) :- 735 atom(Local). 736 737dom_attributes(0) :- !, fail. 738dom_attributes([]). 739dom_attributes([H|T]) :- 740 dom_attribute(H), 741 dom_attributes(T). 742 743dom_attribute(Name=Value) :- 744 dom_name(Name), 745 atomic(Value). 746 747 748 /******************************* 749 * MESSAGES * 750 *******************************/ 751:- multifile 752 prolog:message/3. 753 754% Catch messages. sgml/4 is generated by the SGML2PL binding. 755 756prologmessage(sgml(Parser, File, Line, Message)) --> 757 { get_sgml_parser(Parser, dialect(Dialect)) 758 }, 759 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ]. 760 761 762 /******************************* 763 * XREF SUPPORT * 764 *******************************/ 765 766:- multifile 767 prolog:called_by/2. 768 769prologcalled_by(sgml_parse(_, Options), Called) :- 770 findall(Meta, meta_call_term(_, Meta, Options), Called). 771 772meta_call_term(T, G+N, Options) :- 773 T = call(Event, G), 774 pmember(T, Options), 775 call_params(Event, Term), 776 functor(Term, _, N). 777 778pmember(X, List) :- % member for partial lists 779 nonvar(List), 780 List = [H|T], 781 ( X = H 782 ; pmember(X, T) 783 ). 784 785call_params(begin, begin(tag,attributes,parser)). 786call_params(end, end(tag,parser)). 787call_params(cdata, cdata(cdata,parser)). 788call_params(pi, pi(cdata,parser)). 789call_params(decl, decl(cdata,parser)). 790call_params(error, error(severity,message,parser)). 791call_params(xmlns, xmlns(namespace,url,parser)). 792call_params(urlns, urlns(url,url,parser)). 793 794 /******************************* 795 * SANDBOX * 796 *******************************/ 797 798:- multifile 799 sandbox:safe_primitive/1, 800 sandbox:safe_meta_predicate/1. 801 802sandbox:safe_meta_predicate(sgml:load_structure/3). 803sandbox:safe_primitive(sgml:dtd(Dialect, _)) :- 804 dtd_alias(Dialect, _). 805sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)). 806sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)). 807sandbox:safe_primitive(sgml:xml_name(_,_)). 808sandbox:safe_primitive(sgml:xml_basechar(_)). 809sandbox:safe_primitive(sgml:xml_ideographic(_)). 810sandbox:safe_primitive(sgml:xml_combining_char(_)). 811sandbox:safe_primitive(sgml:xml_digit(_)). 812sandbox:safe_primitive(sgml:xml_extender(_)). 813sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)). 814sandbox:safe_primitive(sgml:xsd_number_string(_,_)). 815sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))
SGML, XML and HTML parser
This library allows you to parse SGML, XML and HTML data into a Prolog data structure. The library defines several families of predicates:
The DOM structure can be used by library(xpath) to extract information from the document.