View source with formatted comments or as raw
    1/*  Part of ClioPatria SeRQL and SPARQL server
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2010-2018, University of Amsterdam,
    7                              VU University Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(url_cache,
   37          [ url_cache/3,                % +URI, -File, -MimeType
   38            url_cache_file/4,           % +URL, +Dir, +Ext, -Path)
   39            url_cache_delete/1,         % +URI
   40            url_cached/2,               % ?URL, ?Property
   41            url_cached/3,               % +Dir, ?URL, ?Property
   42            url_cache_reset_server_status/0,
   43            url_cache_reset_server_status/1 % +Server
   44          ]).   45:- use_module(library(http/http_open)).   46:- if(exists_source(library(http/http_ssl_plugin))).   47:- use_module(library(http/http_ssl_plugin)).   48:- endif.   49:- use_module(library(http/mimetype)).   50:- use_module(library(url)).   51:- use_module(library(debug)).   52:- use_module(library(error)).   53:- use_module(library(settings)).   54:- use_module(library(base64)).   55:- use_module(library(utf8)).   56:- use_module(library(lists)).   57:- use_module(library(sha)).   58
   59:- setting(cache:url_cache_directory, atom, 'cache/url',
   60           'Directory to cache fetched remote URLs').   61
   62/** <module> Cache the content of external URLs in local files
   63
   64This library provides a cache  for  data   stored  in  extenal URLs. The
   65content of each URL is kept in a  file and described by a meta-file that
   66remembers the mime-type, the original URL, when   it was fetched and -if
   67provided by the server- the last-modified stamp.
   68
   69@tbd    The current implementation does not validate the cache content, nor
   70        does it honour the HTTP cache directives.
   71*/
   72
   73
   74%!  url_cache(+URI:atom, -Path:atom, -MimeType:atom) is det.
   75%
   76%   Return the content of URI in  a   file  at Path. MimeType is the
   77%   Mime-type returned by the server.
   78%
   79%   @error  existence_error(url, URL)
   80%           Server did not respond with 200 OK
   81%   @error  existence_error(source_sink, url_cache(.))
   82%           Cache directory does not exist
   83%   @bug    Does not check modification time and cache validity
   84
   85url_cache(URL, Path, MimeType) :-
   86    url_cache_dir(Dir),
   87    url_cache_file(URL, Dir, url, Path),
   88    atom_concat(Path, '.meta', TypeFile),
   89    (   exists_file(Path),
   90        exists_file(TypeFile),
   91        read_meta_file(TypeFile, mime_type(MimeType0))
   92    ->  MimeType = MimeType0
   93    ;   fetch_url(URL, Path, MimeType, Modified),
   94        get_time(NowF),
   95        Now is round(NowF),
   96        open(TypeFile, write, Out,
   97             [ encoding(utf8),
   98               lock(write)
   99             ]),
  100        format(Out,
  101               'mime_type(~q).~n\c
  102                    url(~q).~n\c
  103                    fetched(~q).~n',
  104               [MimeType, URL, Now]),
  105        (   nonvar(Modified)
  106        ->  format(Out, 'last_modified(~q).~n', [Modified])
  107        ;   true
  108        ),
  109        close(Out)
  110    ).
  111
  112read_meta_file(MimeFile, Term) :-
  113    setup_call_cleanup(open(MimeFile, read, In,
  114                            [ encoding(utf8),
  115                              lock(read)
  116                            ]),
  117                       ndet_read(In, Term),
  118                       close(In)).
  119
  120ndet_read(Stream, Term) :-
  121    repeat,
  122    read(Stream, Term0),
  123    (   Term0 == end_of_file
  124    ->  !, fail
  125    ;   Term = Term0
  126    ).
  127
  128%!  url_cache_delete(+URL) is det.
  129%
  130%   Delete an URL from the cache. Succeeds,  even if the cache files
  131%   do not exist.
  132%
  133%   @error  Throws exceptions from delete_file/1 other than
  134%           existence errors.
  135
  136url_cache_delete(URL) :-
  137    url_cache_dir(Dir),
  138    url_cache_file(URL, Dir, url, Path),
  139    atom_concat(Path, '.meta', TypeFile),
  140    catch(delete_file(TypeFile), E0, true),
  141    catch(delete_file(Path), E1, true),
  142    error_ok(E0),
  143    error_ok(E1).
  144
  145error_ok(E) :-
  146    subsumes_term(error(existence_error(file, _), _), E),
  147    !.
  148error_ok(E) :-
  149    throw(E).
  150
  151%!  url_cache_dir(-Dir) is det
  152%
  153%   Return or create the URL caching directory
  154
  155url_cache_dir(Dir) :-
  156    setting(cache:url_cache_directory, Dir),
  157    make_directory_path(Dir).
  158
  159%!  make_directory_path(+Dir) is det.
  160%
  161%   Create Dir and all required components.
  162
  163make_directory_path(Dir) :-
  164    make_directory_path_2(Dir),
  165    !.
  166make_directory_path(Dir) :-
  167    permission_error(create, directory, Dir).
  168
  169make_directory_path_2(Dir) :-
  170    exists_directory(Dir),
  171    !.
  172make_directory_path_2(Dir) :-
  173    Dir \== (/),
  174    !,
  175    file_directory_name(Dir, Parent),
  176    make_directory_path_2(Parent),
  177    make_directory(Dir).
  178
  179%!  fetch_url(+URL:atom, +Path:atom, -MimeType:atom) is det.
  180%
  181%   @error  existence_error(url, URL)
  182
  183fetch_url(URL, File, MimeType, Modified) :-
  184    parse_url_ex(URL, Parts),
  185    server(Parts, Server),
  186    (   allow(Server)
  187    ->  true
  188    ;   throw(error(existence_error(url, URL),
  189                    context(url_cache/3, 'Too many errors from server')))
  190    ),
  191    get_time(Now),
  192    (   catch(fetch_url_raw(URL, File,
  193                            MimeType, Modified), E, true)
  194    ->  (   var(E)
  195        ->  register_stats(Server, Now, true)
  196        ;   register_stats(Server, Now, error(E)),
  197            throw(E)
  198        )
  199    ;   register_stats(Server, Now, false)
  200    ).
  201
  202server(Parts, Server) :-
  203    memberchk(host(Host), Parts),
  204    !,
  205    (   memberchk(port(Port), Parts)
  206    ->  Server = Host:Port
  207    ;   Server = Host
  208    ).
  209server(_,_) :-
  210    assertion(false).
  211
  212/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  213Server status assessment. We keep a   health-status  of the server using
  214the following rules:
  215
  216    * Range -100 .. 100
  217    * Ok if > 0
  218    * The initial status is 100 (healthy)
  219    * Possitive results add 20-4*sqrt(Time)
  220    * Negative results subtract 10
  221    * Add 1 per minute since last status.
  222
  223TBD: frequency matters: requests should not pile up.
  224- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  225
  226:- dynamic
  227    server_status/3.                % Server, Status, Last
  228
  229allow(Server) :-
  230    server_status(Server, Status),
  231    debug(url_cache, 'Status ~q: ~w', [Server, Status]),
  232    Status > 0.
  233
  234server_status(Server, Status) :-
  235    get_time(Now),
  236    with_mutex(url_cache_status,
  237               server_status(Server, S0, T0)),
  238    !,
  239    Status is min(100, S0 + round(Now-T0)//60).
  240server_status(_, 100).
  241
  242register_stats(Server, Start, Result) :-
  243    get_time(Now),
  244    Time is Now - Start,
  245    (   server_status(Server, S0, T0)
  246    ->  true
  247    ;   S0 = 100,
  248        T0 = Now
  249    ),
  250    Since is Start - T0,
  251    update_status(Result, Time, Since, S0, S1),
  252    with_mutex(url_cache_status,
  253               (   retractall(server_status(Server, _, _)),
  254                   assert(server_status(Server, S1, Start)))).
  255
  256update_status(true, Time, Since, S0, S) :-
  257    !,
  258    S is min(100, S0 + round(20-4*sqrt(Time)) + round(Since)//60).
  259update_status(_, Time, _Since, S0, S) :-
  260    !,
  261    S is max(-100, S0 - (10 + round(Time))).
  262
  263
  264%!  url_cache_reset_server_status is det.
  265%!  url_cache_reset_server_status(+Server) is det.
  266%
  267%   Reset the status of the given server or all servers.
  268
  269url_cache_reset_server_status :-
  270    with_mutex(url_cache_status,
  271               retractall(server_status(_,_,_))).
  272url_cache_reset_server_status(Server) :-
  273    must_be(atom, Server),
  274    with_mutex(url_cache_status,
  275               retractall(server_status(Server,_,_))).
  276
  277
  278%!  fetch_url_raw(+URL:atom, +Path:atom, -MimeType:atom, -Modified) is det.
  279%
  280%   Fetch data from URL and put it   into the file Path. MimeType is
  281%   unified  with  the  MIME-type  as  reported  by  the  server  or
  282%   text/plain if the server did not provide a MIME-Type.
  283%
  284%   @error  existence_error(url, URL)
  285
  286fetch_url_raw(URL, File, MimeType, Modified) :-
  287    debug(url_cache, 'Downloading ~w ...', [URL]),
  288    atom_concat(File, '.tmp', TmpFile),
  289    (   catch(fetch_to_file(URL, TmpFile, Code, Header), E, true)
  290    ->  true
  291    ;   E = predicate_failed(http_get/3)
  292    ),
  293    (   var(E)
  294    ->  true
  295    ;   (   debugging(url_cache)
  296        ->  print_message(error, E)
  297        ;   true
  298        ),
  299        catch(delete_file(TmpFile), _, true),
  300        (   debugging(url_cache)
  301        ->  message_to_string(E, Msg),
  302            debug(url_cache, 'Download failed: ~w', [Msg])
  303        ;   true
  304        ),
  305        throw(E)
  306    ),
  307    (   Code == 200
  308    ->  rename_file(TmpFile, File)
  309    ;   catch(delete_file(TmpFile), _, true),
  310        throw(error(existence_error(url, URL), _))
  311    ),
  312    (   memberchk(content_type(MimeType0), Header)
  313    ->  true
  314    ;   MimeType0 = 'text/plain'
  315    ),
  316    ignore(memberchk(last_modified(Modified), Header)),
  317    debug(url_cache, 'Downloaded ~w, mime-type: ~w',
  318          [URL, MimeType0]),
  319    MimeType = MimeType0.
  320
  321fetch_to_file(URL, File, Code,
  322              [ content_type(ContentType),
  323                last_modified(LastModified)
  324              ]) :-
  325    setup_call_cleanup(
  326        open(File, write, Out, [ type(binary) ]),
  327        setup_call_cleanup(
  328            http_open(URL, In,
  329                      [ header(content_type, ContentType),
  330                        header(last_modified, LastModified),
  331                        status_code(Code),
  332                        cert_verify_hook(ssl_verify)
  333                      ]),
  334            copy_stream_data(In, Out),
  335            close(In)),
  336        close(Out)).
  337
  338:- public ssl_verify/5.  339
  340%!  ssl_verify(+SSL, +ProblemCert, +AllCerts, +FirstCert, +Error)
  341%
  342%   Currently we accept  all  certificates.
  343
  344ssl_verify(_SSL,
  345           _ProblemCertificate, _AllCertificates, _FirstCertificate,
  346           _Error).
  347
  348parse_url_ex(URL, Parts) :-
  349    is_list(URL),
  350    !,
  351    Parts = URL.
  352parse_url_ex(URL, Parts) :-
  353    parse_url(URL, Parts),
  354    !.
  355parse_url_ex(URL, _) :-
  356    domain_error(url, URL).
  357
  358%!  url_cache_file(+URL, +Dir, +Ext, -Path) is det
  359%
  360%   Determine location of cache-file for the   given  URL in Dir. If
  361%   Ext is provided, the  returned  Path   is  ensured  to  have the
  362%   specified extension.
  363
  364url_cache_file(URL, Dir, Ext, Path) :-
  365    url_to_file(URL, Ext, File),
  366    sub_atom(File, 0, 2, _, L1),
  367    ensure_dir(Dir, L1, Dir1),
  368    sub_atom(File, 2, 2, _, L2),
  369    ensure_dir(Dir1, L2, Dir2),
  370    sub_atom(File, 4, _, 0, LocalFile),
  371    atomic_list_concat([Dir2, /, LocalFile], Path).
  372
  373ensure_dir(D0, Sub, Dir) :-
  374    atomic_list_concat([D0, /, Sub], Dir),
  375    (   exists_directory(Dir)
  376    ->  true
  377    ;   make_directory(Dir)
  378    ).
  379
  380%!  url_to_file(+URL, +Ext, -File) is det.
  381%
  382%   File is a filename for storing URL and has extension Ext. We use
  383%   a cryptographic hash to ensure consistent naming, a name that is
  384%   guaranteed to fit in every sensible filesystem and ensure a good
  385%   distribution of the cache directories.
  386
  387url_to_file(URL, Ext, File) :-
  388    sha_hash(URL, Hash, []),
  389    phrase(hex_digits(Hash), Codes),
  390    string_to_list(String, Codes),
  391    file_name_extension(String, Ext, File).
  392
  393hex_digits([]) -->
  394    "".
  395hex_digits([H|T]) -->
  396    byte(H),
  397    hex_digits(T).
  398
  399byte(Byte) -->
  400    { High is (Byte>>4) /\ 0xf,
  401      Low is (Byte /\ 0xf),
  402      code_type(H, xdigit(High)),
  403      code_type(L, xdigit(Low))
  404    },
  405    [H,L].
  406
  407
  408                 /*******************************
  409                 *          READ CACHE          *
  410                 *******************************/
  411
  412%!  url_cached(?URL, ?Property) is nondet.
  413%!  url_cached(+Dir, ?URL, ?Property) is nondet.
  414%
  415%   True if URL is in the cache represented by the directory Dir and
  416%   has Property.  Defined properties are:
  417%
  418%       * file(-File)
  419%       File is the cache-file for the given URL
  420%       * mime_type(-Mime)
  421%       Mime is the mime-type of the URL as reported by the server
  422%       * fetched(-Stamp:integer)
  423%       Timestamp that specifies when the URL was fetched
  424%       * last_modified(-Modified:atom)
  425%       If present, this is the modification time as provided by
  426%       the server.
  427
  428url_cached(URL, Property) :-
  429    url_cache_dir(Dir),
  430    url_cached(Dir, URL, Property).
  431
  432url_cached(Dir, URL, Property) :-
  433    nonvar(URL),
  434    !,
  435    url_cache_file(URL, Dir, url, Path),
  436    atom_concat(Path, '.meta', MetaFile),
  437    exists_file(MetaFile),
  438    cache_file_property(Property, MetaFile).
  439url_cached(Dir, URL, Property) :-
  440    nonvar(Property),
  441    Property = file(File),
  442    atom(File),
  443    atom_concat(Dir, Rest, File),
  444    \+ sub_atom(Rest, _, _, _, '../'),
  445    file_name_extension(Base, url, File),
  446    file_name_extension(Base, meta, MetaFile),
  447    exists_file(MetaFile),
  448    once(read_meta_file(MetaFile, url(URL))).
  449url_cached(Dir, URL, Property) :-
  450    atom_concat(Dir, '/??', TopPat),
  451    expand_file_name(TopPat, TopDirs),
  452    member(TopDir, TopDirs),
  453    atom_concat(TopDir, '/??', DirPat),
  454    expand_file_name(DirPat, FileDirs),
  455    member(FileDir, FileDirs),
  456    atom_concat(FileDir, '/*.meta', FilePat),
  457    expand_file_name(FilePat, MetaFiles),
  458    member(MetaFile, MetaFiles),
  459    once(read_meta_file(MetaFile, url(URL))),
  460    check_cache_file(MetaFile, URL),
  461    cache_file_property(Property, MetaFile).
  462
  463check_cache_file(MetaFile, URL) :-
  464    file_name_extension(File, meta, MetaFile),
  465    (   exists_file(File)
  466    ->  true
  467    ;   print_message(warning, url_cache(no_file(File, MetaFile, URL))),
  468        delete_file(MetaFile),
  469        fail
  470    ).
  471
  472cache_file_property(Property, MetaFile) :-
  473    var(Property),
  474    !,
  475    cache_file_property_ndet(Property, MetaFile).
  476cache_file_property(Property, MetaFile) :-
  477    cache_file_property_ndet(Property, MetaFile),
  478    !.
  479
  480
  481cache_file_property_ndet(file(File), MetaFile) :-
  482    file_name_extension(File, meta, MetaFile).
  483cache_file_property_ndet(P, MetaFile) :-
  484    read_meta_file(MetaFile, P),
  485    P \= url(_).
  486
  487                 /*******************************
  488                 *           MESSAGES           *
  489                 *******************************/
  490
  491:- multifile
  492    prolog:message//1.  493
  494prolog:message(url_cache(no_file(File, _MetaFile, URL))) -->
  495    [ 'URL Cache: file ~q does not exist (URL=~q)'-[File, URL] ]