LifeV
link_check.cpp
Go to the documentation of this file.
1 // link_check implementation -----------------------------------------------//
2 
3 // Copyright Beman Dawes 2002.
4 // Distributed under the Boost Software License, Version 1.0.
5 // (See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 
8 #include "link_check.hpp"
9 #include <boost/regex.hpp>
10 #include <boost/filesystem/operations.hpp>
11 #include <boost/filesystem/exception.hpp>
12 
13 namespace fs = boost::filesystem;
14 
15 namespace
16 {
17 boost::regex url_regex (
18  "<\\s*[^>]*\\s+(?:HREF|SRC)" // HREF or SRC
19  "\\s*=\\s*\"([^\"]*)\"",
21 
22 } // unnamed namespace
23 
24 namespace boost
25 {
26 namespace inspect
27 {
28 
29 // link_check constructor --------------------------------------------------//
30 
34 {
35 }
36 
37 // inspect (all) -----------------------------------------------------------//
38 
39 void link_check::inspect (
40  const string& library_name,
41  const path& full_path )
42 {
43  // keep track of paths already encountered to reduce disk activity
44  if ( !fs::is_directory ( full_path ) )
45  {
46  m_paths[ relative_to ( full_path, fs::initial_path() ) ] |= m_present;
47  }
48 }
49 
50 // inspect ( .htm, .html ) -------------------------------------------------//
51 
52 void link_check::inspect (
53  const string& library_name,
54  const path& full_path, // example: c:/foo/boost/filesystem/path.hpp
55  const string& contents ) // contents of file to be inspected
56 {
57  string::const_iterator start ( contents.begin() );
58  string::const_iterator end ( contents.end() );
59  boost::match_results< string::const_iterator > what;
60  boost::match_flag_type flags = boost::match_default;
61 
62  while ( boost::regex_search ( start, end, what, url_regex, flags) )
63  {
64  // what[0] contains the whole string iterators.
65  // what[1] contains the URL iterators.
66  do_url ( string ( what[1].first, what[1].second ),
67  library_name, full_path );
68 
69  start = what[0].second; // update search position
70  flags |= boost::match_prev_avail; // update flags
71  flags |= boost::match_not_bob;
72  }
73 }
74 
75 // do_url ------------------------------------------------------------------//
76 
77 void link_check::do_url ( const string& url, const string& library_name,
78  const path& source_path ) // precondition: source_path.is_complete()
79 {
80  if ( url[0] == '#'
81  || url.find ( "mailto:" ) == 0
82  || url.find ( "http:" ) == 0
83  || url.find ( "https:" ) == 0
84  || url.find ( "ftp:" ) == 0
85  || url.find ( "news:" ) == 0
86  || url.find ( "javascript:" ) == 0
87  )
88  {
89  return;
90  }
91 
92  if ( url.find ( "file:" ) == 0 )
93  {
95  error ( library_name, source_path, "invalid URL (hardwired file): " + url );
96  return;
97  }
98 
99  // detect characters banned by RFC2396:
100  if ( url.find_first_of ( " <>\"{}|\\^[]'" ) != string::npos )
101  {
103  error ( library_name, source_path, "invalid character in URL: " + url );
104  }
105 
106  // strip url of bookmarks
107  string plain_url ( url );
108  string::size_type pos ( plain_url.find ( '#' ) );
109  if ( pos != string::npos )
110  {
111  plain_url.erase ( pos );
112  // detect characters banned by RFC2396 in bookmark:
113  if ( url.find ( '#', pos + 1 ) != string::npos )
114  {
116  error ( library_name, source_path, "invalid bookmark: " + url );
117  }
118  }
119 
120  // strip url of references to current dir
121  if ( plain_url[0] == '.' && plain_url[1] == '/' )
122  {
123  plain_url.erase ( 0, 2 );
124  }
125 
126  // url is relative source_path.branch()
127  // convert to target_path, which is_complete()
128  path target_path;
129  try
130  {
131  target_path = source_path.branch_path() /= path ( plain_url, fs::no_check );
132  }
133  catch ( const fs::filesystem_error& )
134  {
136  error ( library_name, source_path, "invalid URL: " + url );
137  return;
138  }
139 
140  // create a m_paths entry if necessary
141  std::pair< const string, int > entry (
142  relative_to ( target_path, fs::initial_path() ), 0 );
143  m_path_map::iterator itr ( m_paths.find ( entry.first ) );
144  if ( itr == m_paths.end() )
145  {
146  if ( fs::exists ( target_path ) )
147  {
148  entry.second = m_present;
149  }
150  itr = m_paths.insert ( entry ).first;
151  }
152 
153  // itr now points to the m_paths entry
154  itr->second |= m_linked_to;
155 
156  // if target isn't present, the link is broken
157  if ( (itr->second & m_present) == 0 )
158  {
159  ++m_broken_errors;
160  error ( library_name, source_path, "broken link: " + url );
161  }
162 }
163 
164 // close -------------------------------------------------------------------//
165 
167 {
168  for ( m_path_map::const_iterator itr = m_paths.begin();
169  itr != m_paths.end(); ++itr )
170  {
171  // std::clog << itr->first << " " << itr->second << "\n";
172  if ( (itr->second & m_linked_to) != m_linked_to
173  && (itr->first.rfind ( ".html" ) == itr->first.size() - 5
174  || itr->first.rfind ( ".htm" ) == itr->first.size() - 4)
175  // because they may be redirectors, it is OK if these are unlinked:
176  && itr->first.rfind ( "index.html" ) == string::npos
177  && itr->first.rfind ( "index.htm" ) == string::npos )
178  {
179  ++m_unlinked_errors;
180  path full_path ( fs::initial_path() / path (itr->first, fs::no_check) );
181  error ( impute_library ( full_path ), full_path, "unlinked file" );
182  }
183  }
184 }
185 
186 } // namespace inspect
187 } // namespace boost