xmlStrlen is 2 - 30 times slower than glibc strlen
Given the simplest benchmark, xmlStrlen
is two times slower than strlen
from glibc
on small strings, ten times slower on average strings, and thirty times slower on big strings and an entire HTML file.
The performance of xmlStrlen
resulted in the 0.7 - 1.5 seconds to parse and search through 2 MB HTML document by using Nokogiri. I described this issue in the Nokogiri reposirory which led to 2x speedup on the Nokogiri side, but xmlStrlen
still could be faster.
libxml2
version
$ xml2-config --version
2.9.10
Benchmark results
$ ./slow_parsing_benchmark
xmlStrlen (entire HTML file): 926171.936981 μs
glibc_xmlStrlen (entire HTML file): 36905.903992 μs
delta (xmlStrlen ÷ glibc_xmlStrlen): 25.094584 times
xmlStrlen (average string): 57479.204010 μs
glibc_xmlStrlen (average string): 5802.069000 μs
delta (xmlStrlen ÷ glibc_xmlStrlen): 9.905937 times
xmlStrlen (bigger string): 388056.315979 μs
glibc_xmlStrlen (bigger string): 12797.856995 μs
delta (xmlStrlen ÷ glibc_xmlStrlen): 30.318382 times
xmlStrlen (smallest string): 15870.046021 μs
glibc_xmlStrlen (smallest string): 6282.208984 μs
delta (xmlStrlen ÷ glibc_xmlStrlen): 2.527903 times
Benchmark code
big_html.zip I've used in the code below.
#include <string.h>
#include <stdio.h>
#include <time.h>
#include "libxml/HTMLtree.h"
#include "libxml/xpath.h"
double time_in_microsec()
{
struct timespec tv;
clock_gettime(CLOCK_MONOTONIC_RAW, &tv);
return (double) (tv.tv_sec * 1000000 + tv.tv_nsec / 1000.0);
}
double elapsed_ms_since(double start)
{
return time_in_microsec() - start;
}
int glibc_xmlStrlen(const xmlChar *str) {
if (str == NULL) return(0);
return strlen((const char*)str);
}
int main(int argc, char *argv[])
{
FILE *fp = fopen ("big_html.html", "rb");
char *buffer = NULL;
size_t len;
ssize_t bytes_read = getdelim(&buffer, &len, '\0', fp);
const int N = 1000000;
double start_time, prev_start_time;
prev_start_time = start_time;
start_time = time_in_microsec();
for (int j = 0; j < 1000; j++) {
xmlStrlen(buffer);
}
prev_start_time = start_time;
start_time = time_in_microsec();
for (int j = 0; j < 1000; j++) {
glibc_xmlStrlen(buffer);
}
fprintf(stderr, "xmlStrlen (entire HTML file): %f μs\n", elapsed_ms_since(prev_start_time));
fprintf(stderr, "glibc_xmlStrlen (entire HTML file): %f μs\n", elapsed_ms_since(start_time));
fprintf(stderr, "delta (xmlStrlen ÷ glibc_xmlStrlen): %f times\n\n", elapsed_ms_since(prev_start_time) / elapsed_ms_since(start_time));
xmlChar *str = ".//*[contains(concat(' ', normalize-space(@class), ' '), ' p7n7Ze ') and not(.//a)]";
prev_start_time = start_time;
start_time = time_in_microsec();
for (int j = 0; j < N; j++) {
xmlStrlen(str);
}
prev_start_time = start_time;
start_time = time_in_microsec();
for (int j = 0; j < N; j++) {
glibc_xmlStrlen(str);
}
fprintf(stderr, "xmlStrlen (average string): %f μs\n", elapsed_ms_since(prev_start_time));
fprintf(stderr, "glibc_xmlStrlen (average string): %f μs\n", elapsed_ms_since(start_time));
fprintf(stderr, "delta (xmlStrlen ÷ glibc_xmlStrlen): %f times\n\n", elapsed_ms_since(prev_start_time) / elapsed_ms_since(start_time));
str = ".//*[contains(concat(' ', normalize-space(@class), ' '), ' na4ICd ') and not(contains(concat(' ', normalize-space(@class), ' '), ' rOwove ')) and not(contains(concat(' ', normalize-space(@class), ' '), ' TxCHDf '))][position() = 1] | .//*[contains(concat(' ', normalize-space(@class), ' '), ' hBUZL ') and not(contains(concat(' ', normalize-space(@class), ' '), ' Rv2Cae '))][position() = 2] | .//*[contains(concat(' ', normalize-space(@class), ' '), ' hBUZL ') and not(contains(concat(' ', normalize-space(@class), ' '), ' Rv2Cae ')) and not(contains(concat(' ', normalize-space(@class), ' '), ' Fxxvzc ')) and not(.//span) and not(.//div)] | .//*[contains(concat(' ', normalize-space(@class), ' '), ' dWRflb ')] | .//*[contains(concat(' ', normalize-space(@class), ' '), ' p7n7Ze ') and not(.//a)]";
prev_start_time = start_time;
start_time = time_in_microsec();
for (int j = 0; j < N; j++) {
xmlStrlen(str);
}
prev_start_time = start_time;
start_time = time_in_microsec();
for (int j = 0; j < N; j++) {
glibc_xmlStrlen(str);
}
fprintf(stderr, "xmlStrlen (bigger string): %f μs\n", elapsed_ms_since(prev_start_time));
fprintf(stderr, "glibc_xmlStrlen (bigger string): %f μs\n", elapsed_ms_since(start_time));
fprintf(stderr, "delta (xmlStrlen ÷ glibc_xmlStrlen): %f times\n\n", elapsed_ms_since(prev_start_time) / elapsed_ms_since(start_time));
str = ".//*[not(.//a)]";
prev_start_time = start_time;
start_time = time_in_microsec();
for (int j = 0; j < N; j++) {
xmlStrlen(str);
}
prev_start_time = start_time;
start_time = time_in_microsec();
for (int j = 0; j < N; j++) {
glibc_xmlStrlen(str);
}
fprintf(stderr, "xmlStrlen (smallest string): %f μs\n", elapsed_ms_since(prev_start_time));
fprintf(stderr, "glibc_xmlStrlen (smallest string): %f μs\n", elapsed_ms_since(start_time));
fprintf(stderr, "delta (xmlStrlen ÷ glibc_xmlStrlen): %f times\n\n", elapsed_ms_since(prev_start_time) / elapsed_ms_since(start_time));
}
Makefile
CFLAGS=$(shell xml2-config --cflags)
LDFLAGS=$(shell xml2-config --libs)
slow_parsing_benchmark: slow_parsing_benchmark.o
$(CC) -O2 -o slow_parsing_benchmark slow_parsing_benchmark.o $(LDFLAGS)
slow_parsing_benchmark.o: slow_parsing_benchmark.c
$(CC) $(CFLAGS) -c -o slow_parsing_benchmark.o slow_parsing_benchmark.c
clean:
rm -f slow_parsing_benchmark.o slow_parsing_benchmark slow_parsing_benchmark_v2
The naive approach to simply reuse strlen
in xmlStrlen
speeds up our document parsing and searching from 1.4 seconds to about 700 ms.
diff --git a/xmlstring.c b/xmlstring.c
index e8a1e45d..df247dff 100644
--- a/xmlstring.c
+++ b/xmlstring.c
@@ -423,14 +423,9 @@ xmlStrsub(const xmlChar *str, int start, int len) {
int
xmlStrlen(const xmlChar *str) {
- int len = 0;
-
if (str == NULL) return(0);
- while (*str != 0) { /* non input consuming */
- str++;
- len++;
- }
- return(len);
+
+ return strlen((const char*)str);
}
/**
I'd be happy to open a PR. What do you think about speeding up xmlStrlen
?
Edited by Ilya Zub