Dumping ASCII-encoded XML randomly drops characters
While working with XML files that contain Unicode characters (through the Python lxml
wrapper), I discovered that dumping the XML with ASCII encoding (the default in lxml's `tostring`)
drops characters from some elements' text in seemingly random situations.
I was able to reproduce the problem with the latest libxml2 commit (dea91c97), using the following program and the attached bug2.xml file (which is obfuscated because the original contains confidential data):
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/xmlsave.h>
#include <libxml/xpath.h>
/*
 * Print the text of the first node matched by the reproducer's XPath query,
 * followed by that text's length in bytes (as measured by strlen).
 *
 * doc: parsed document to query. A NULL document is reported on stderr and
 *      nothing is printed.
 *
 * A diagnostic is written to stderr when the XPath context cannot be created,
 * the expression fails to evaluate, or the expression matches no node. The
 * XPath context and result object allocated here are released before
 * returning, so the function can be called repeatedly without leaking.
 */
void print_length(xmlDocPtr doc) {
    xmlXPathContextPtr xpathCtx = NULL;
    xmlXPathObjectPtr xpathObj = NULL;
    xmlNodePtr node;
    const char *text;

    if (doc == NULL) {
        fprintf(stderr, "print_length: no document to inspect\n");
        return;
    }

    xpathCtx = xmlXPathNewContext(doc);
    if (xpathCtx == NULL) {
        fprintf(stderr, "print_length: cannot create XPath context\n");
        goto cleanup;
    }

    /* BAD_CAST converts the char literal to the xmlChar string the API expects. */
    xpathObj = xmlXPathEvalExpression(
        BAD_CAST "//instruction[@idid=\"60\"]/description/text()", xpathCtx);
    if (xpathObj == NULL) {
        fprintf(stderr, "print_length: XPath evaluation failed\n");
        goto cleanup;
    }

    /* Indexing nodeTab[0] is only valid when the node set is non-empty. */
    if (xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) {
        fprintf(stderr, "print_length: XPath expression matched no node\n");
        goto cleanup;
    }

    node = xpathObj->nodesetval->nodeTab[0];
    text = (node->content != NULL) ? (const char *)node->content : "";

    /* strlen returns size_t, which must be printed with %zu. */
    printf("%s\ntext length: %zu\n\n", text, strlen(text));

cleanup:
    /* Both free functions accept NULL, so no guards are needed here. */
    xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
}
int main(int argc, char **argv) {
xmlDocPtr doc;
doc = xmlParseFile("bug2.xml");
doc->encoding = xmlStrdup("UTF-8");
print_length(doc);
xmlCharEncodingHandler *enchandler = xmlFindCharEncodingHandler("ASCII");
xmlOutputBuffer *buffer = xmlAllocOutputBuffer(enchandler);
xmlNodeDumpOutput(buffer, NULL, doc->children, 0, 0, "ASCII");
xmlOutputBufferFlush(buffer);
xmlChar *result = xmlCharStrdup(xmlBufferContent(buffer->conv));
// printf("%s\n", result); // <= Uncomment and look at the length of the last "description" element
doc = xmlParseDoc(result);
print_length(doc);
xmlOutputBufferClose(buffer);
return 0;
}
Note that the failure appears to depend on the exact input: if you remove an arbitrary part of the input XML file, the output will most probably be correct in the follow-up test.