Commit f933c898 authored by Daniel Veillard
Browse files

Keep non-significant blank nodes in the HTML parser

For https://bugzilla.gnome.org/show_bug.cgi?id=681822

Regardless of whether the option HTML_PARSE_NOBLANKS is set, blank nodes
are removed from an HTML document, for example:

<html>
  <head>
    <title>This is a test.</title>
  </head>
  <body>
    <p>This is a test.</p>
  </body>
</html>

is read as:

<html><head><title>This is a test.</title></head><body>
    <p>This is a test.</p>
  </body></html>

This changes the default behaviour, but the old behaviour remains available,
as expected, when using the parser flag HTML_PARSE_NOBLANKS.

Based on original patch from Igor Ignatyuk <igor_ignatiouk@hotmail.com>

* HTMLparser.c: change various places in the parser where the ignorableWhitespace
  SAX callback was called without checking the parser flag preference
* xmllint.c: make sure we use the new flag even for HTML parsing
* result/HTML/*: this modifies the output of a number of tests
parent 878ec9db
......@@ -2981,9 +2981,14 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
buf, nbchar);
if (ctxt->keepBlanks) {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, nbchar);
} else {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
buf, nbchar);
}
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
......@@ -3014,8 +3019,14 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
if (ctxt->keepBlanks) {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, nbchar);
} else {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
buf, nbchar);
}
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
......@@ -5687,9 +5698,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if ((cur != '<') && (cur != '&')) {
if (ctxt->sax != NULL) {
if (IS_BLANK_CH(cur)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(
ctxt->userData, &cur, 1);
if (ctxt->keepBlanks) {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(
ctxt->userData, &cur, 1);
} else {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(
ctxt->userData, &cur, 1);
}
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
......
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><style>
<html>
<head>
<style>
.......
...............................................................
...............................................................
......@@ -63,4 +65,6 @@
...............................................................
...............................................................
...............................................................
</style></head></html>
</style>
</head>
</html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head><title>This service is temporary down</title></head>
<head>
<title>This service is temporary down</title>
</head>
<body bgcolor="#FFFFFF">
<h1 align="center">Sorry, this service is temporary down</h1>
We are doing our best to get it back on-line,
......
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body bgcolor="#FFFFFF">
<a href="mailto:katherine@cbfanc.org,website@bis.doc.gov?subject=South%20San%20Francisco%20BIS%20Seminar%20-%20October%2016th"></a><br>
</body></html>
<html>
<body bgcolor="#FFFFFF">
<a href="mailto:katherine@cbfanc.org,website@bis.doc.gov?subject=South%20San%20Francisco%20BIS%20Seminar%20-%20October%2016th"></a><br>
</body>
</html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><hr></body></html>
<html><body>
<hr>
</body></html>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/strict.dtd">
<html>
<head><title>gnome-xml push mode bug</title></head>
<head>
<title>gnome-xml push mode bug</title>
</head>
<body>
<table border="4"><tr>
<td bgcolor="white">
<table border="4">
<tr>
<td bgcolor="white">
Foo1
<table border="4"><tr><td>Foo2<p></p>
<table border="4">
<tr>
<td>Foo2<p></p>
<p></p>
</td></tr></table>
</td>
</tr>
</table>
</td>
<td bgcolor="blue">Foo3</td>
</tr></table>
</body>
</tr>
</table>
</body>
</html>
......@@ -7,7 +7,9 @@
function NS_NullWindow(){this.window;}
function NS_NewOpen(url,nam,atr){return(new NS_NullWindow());}
window.open=NS_NewOpen;
</script><!-- END Naviscope Javascript --><!-- saved from url=(0027)http://www.agents-tech.com/ --><meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type">
</script>
<!-- END Naviscope Javascript --><!-- saved from url=(0027)http://www.agents-tech.com/ -->
<meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type">
<meta content="Copernic.com Inc. develops innovative agent technology solutions to efficiently access and manage the overwhelming quantity of information available on the Internet and intranets." name="DESCRIPTION">
<meta content="agent,technology,intranet,extranet,management,filtering,ranking,solution,service,intelligent,intelligence,client,server,architecture,developer,development,information,telecommunication,announcement,press,product,profile,contact,multi-agent,meta-search,metasearch,multi-thread,mobile,wireless,shopping,robot,PCS,Copernic,engine,toolkit,CDK,EDK" name="KEYWORDS">
<meta content="MSHTML 5.00.3103.1000" name="GENERATOR">
......
This diff is collapsed.
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><p tst="a&amp;b" tst2="a&amp;b" tst3="a &amp; b">
<html><body>
<p tst="a&amp;b" tst2="a&amp;b" tst3="a &amp; b">
a&amp;b
a&amp;b
a &amp; b
</p></body></html>
</p>
</body></html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body>
<html>
<body>
<form>
<input type="text" name="test" value="&scaron;">
</form>
</body></html>
</body>
</html>
<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
<html>
<head>
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<title>README - Microsoft FrontPage 2000 Server Extensions</title>
<meta name="Microsoft Theme" content="none">
</head>
<body>
<font face="Verdana">
<h1><a name="top">Microsoft FrontPage 2000 Server Extensions, UNIX</a></h1>
......@@ -16,13 +18,16 @@
</p>
<ul>
<li>Authoring FrontPage webs</li>
<li>Authoring FrontPage webs</li>
<li>Administering FrontPage webs</li>
<li>Browse-time FrontPage web functionality</li>
</ul>
<h2>Contents&nbsp;</h2>
<a href="#relnotes">Release Notes</a><br><a href="#moreinfo">Resources for More Information</a>
<a href="#relnotes">Release Notes</a><br>
<a href="#moreinfo">Resources for More Information</a>
<p>&nbsp;</p>
<hr>
<h2><a name="relnotes">Release Notes</a></h2>
......@@ -30,7 +35,9 @@
<p>This section provides complementary or late-breaking
information to supplement the Microsoft FrontPage Server Extensions documentation.</p>
<p><a href="#apache">Apache 1.3.4 Support</a><br><a href="#upgrading">Upgrading from previous version of FrontPage Server Extensions</a><br><a href="#executables">Uploading files into executable folders</a></p>
<p><a href="#apache">Apache 1.3.4 Support</a><br>
<a href="#upgrading">Upgrading from previous version of FrontPage Server Extensions</a><br>
<a href="#executables">Uploading files into executable folders</a></p>
<p align="right"><font size="1"><a href="#top">Top of Page</a></font></p>
......@@ -100,12 +107,17 @@ see the FrontPage 2000 Server Extensions Resource Kit at <a href="http://officeu
<hr>
<h2><a name="moreinfo">Resources for More Information</a></h2>
<p>This section lists sources of more information about the
FrontPage Server Extensions.</p>
<p><a href="#serk">Server Extensions Resource Kit</a><br><a href="#serkupdate">Server Extensions Resource Kit Update</a><br><a href="#kb">Knowledge Base</a></p>
<p><a href="#serk">Server Extensions Resource Kit</a><br>
<a href="#serkupdate">Server Extensions Resource Kit Update</a><br>
<a href="#kb">Knowledge Base</a></p>
<p align="right"><font size="1"><a href="#top">Top of Page</a></font></p>
......
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head><meta charset="iso-8859-1"></head>
<head>
<meta charset="iso-8859-1">
</head>
<body>
<p>tr&egrave;s</p>
</body>
......
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head><title></title></head>
<head>
<title></title>
</head>
<body>
<ul>
<li>First item
......@@ -8,5 +10,6 @@
<li>Second item, closes the first one
</li>
</ul>
</body>
</html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta name="Author" content="Root &lt;root@aol.com&gt;"></head></html>
<html>
<head>
<meta name="Author" content="Root &lt;root@aol.com&gt;">
</head>
</html>
<!DOCTYPE html>
<html>
<head>
<title>omg</title>
<noscript><link rel="stylesheet" href="http://foo.com"></noscript>
</head>
<body id="xxx">
<head>
<title>omg</title>
<noscript><link rel="stylesheet" href="http://foo.com"></noscript>
</head>
<body id="xxx">
<p>yo</p>
</body>
</html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body>
<html>
<body>
<pre><a href="toto"></a><img src="titi"></pre>
</body></html>
</body>
</html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<?xml-stylesheet href="./css/ht2html.css" type="text/css"?><html>
<!-- THIS PAGE IS AUTOMATICALLY GENERATED. DO NOT EDIT. --><head><title>Python Programming Language</title></head>
<!-- THIS PAGE IS AUTOMATICALLY GENERATED. DO NOT EDIT. -->
<head>
<title>Python Programming Language</title>
</head>
<body>
</body>
</html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head><title>Regression test 1</title></head>
<head>
<title>Regression test 1</title>
</head>
<body>
<h1>Regression test 1</h1>
<p>
......
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head><title>Regression test 2</title></head>
<head>
<title>Regression test 2</title>
</head>
<body>
<h1>Regression test 2</h1>
<p>
......
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head><title>Regression test 3</title></head>
<head>
<title>Regression test 3</title>
</head>
<body>
<h1>Regression test 3</h1>
<p>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment