From 9dc6010da3c10a5a843c40d1f1d1e7156430f7af Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 21:56:22 -0700 Subject: [PATCH 01/16] Fix RelaxNG factories to raise on parse failure instead of wrapping NULL. --- ext/libxml/ruby_xml_relaxng.c | 9 +++++++++ test/test_relaxng.rb | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/ext/libxml/ruby_xml_relaxng.c b/ext/libxml/ruby_xml_relaxng.c index e4d00790..2973036a 100644 --- a/ext/libxml/ruby_xml_relaxng.c +++ b/ext/libxml/ruby_xml_relaxng.c @@ -63,6 +63,9 @@ static VALUE rxml_relaxng_init_from_uri(VALUE class, VALUE uri) xrelaxng = xmlRelaxNGParse(xparser); xmlRelaxNGFreeParserCtxt(xparser); + if (!xrelaxng) + rxml_raise(xmlGetLastError()); + return TypedData_Wrap_Struct(cXMLRelaxNG, &rxml_relaxng_data_type, xrelaxng); } @@ -84,6 +87,9 @@ static VALUE rxml_relaxng_init_from_document(VALUE class, VALUE document) xrelaxng = xmlRelaxNGParse(xparser); xmlRelaxNGFreeParserCtxt(xparser); + if (!xrelaxng) + rxml_raise(xmlGetLastError()); + return TypedData_Wrap_Struct(cXMLRelaxNG, &rxml_relaxng_data_type, xrelaxng); } @@ -104,6 +110,9 @@ static VALUE rxml_relaxng_init_from_string(VALUE self, VALUE relaxng_str) xrelaxng = xmlRelaxNGParse(xparser); xmlRelaxNGFreeParserCtxt(xparser); + if (!xrelaxng) + rxml_raise(xmlGetLastError()); + return TypedData_Wrap_Struct(cXMLRelaxNG, &rxml_relaxng_data_type, xrelaxng); } diff --git a/test/test_relaxng.rb b/test/test_relaxng.rb index 01cb2146..9742681c 100644 --- a/test/test_relaxng.rb +++ b/test/test_relaxng.rb @@ -26,6 +26,12 @@ def test_valid assert(@doc.validate_relaxng(relaxng)) end + def test_parse_failure_raises + assert_raises(LibXML::XML::Error) do + LibXML::XML::RelaxNG.new("") + end + end + def test_invalid new_node = LibXML::XML::Node.new('invalid', 'this will mess up validation') @doc.root << new_node From d66caca400403c5b4bb8eb9104ba07abb4d71e8f Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 22:28:44 -0700 Subject: [PATCH 
02/16] Clamp read callback to buffer size to prevent overflow. --- .gitignore | 46 ++++++++++++++++++++-------------------- Gemfile | 1 + LICENSE | 40 +++++++++++++++++----------------- Rakefile | 19 ++++++++++++----- ext/libxml/ruby_xml_io.c | 2 ++ libxml-ruby.gemspec | 1 + test/test_xpath.rb | 12 +++++------ 7 files changed, 66 insertions(+), 55 deletions(-) diff --git a/.gitignore b/.gitignore index 27721079..b8d139b2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,23 +1,23 @@ -pkg -nbproject -website/_site -*.swp -*.swo - -/ext/vc/libxml_ruby.sdf -/ext/vc/libxml_ruby_19/Debug -/ext/vc/libxml_ruby_18/Debug -/doc -/tmp -/.idea -/ext/vc/ipch -/ext/vc/libxml_ruby.opensdf -.config -InstalledFiles -ext/libxml/libxml_ruby.bundle -.DS_Store -/ext/vc/*.suo -Gemfile.lock -lib/*/libxml_ruby.so -/ext/vc/libxml_ruby/x64 -/ext/vc/libxml_ruby/libxml_ruby.vcxproj.user +pkg +nbproject +website/_site +*.swp +*.swo + +/ext/vc/libxml_ruby.sdf +/ext/vc/libxml_ruby_19/Debug +/ext/vc/libxml_ruby_18/Debug +/doc +/tmp +/.idea +/ext/vc/ipch +/ext/vc/libxml_ruby.opensdf +.config +InstalledFiles +ext/libxml/libxml_ruby.bundle +.DS_Store +/ext/vc/*.suo +Gemfile.lock +lib/*/libxml_ruby.so +/ext/vc/libxml_ruby/x64 +/ext/vc/libxml_ruby/libxml_ruby.vcxproj.user diff --git a/Gemfile b/Gemfile index b42b1ad4..6e35ac6c 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,4 @@ source "https://www.rubygems.org" gemspec +gem "ruby_memcheck", path: "../ruby_memcheck" diff --git a/LICENSE b/LICENSE index 71fe8d11..373b74ec 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,21 @@ - Copyright (c) 2008-2013 Charlie Savage and contributors - Copyright (c) 2002-2007 Sean Chittenden and contributors - Copyright (c) 2001 Wai-Sun "Squidster" Chia - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + Copyright (c) 2008-2013 Charlie Savage and contributors + Copyright (c) 2002-2007 Sean Chittenden and contributors + Copyright (c) 2001 Wai-Sun "Squidster" Chia + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file diff --git a/Rakefile b/Rakefile index d917ce5f..3f3ba06c 100644 --- a/Rakefile +++ b/Rakefile @@ -3,6 +3,7 @@ require "rubygems" require "rake/extensiontask" require "rake/testtask" +require "ruby_memcheck" require "rubygems/package_task" require "rdoc/task" require "yaml" @@ -15,6 +16,14 @@ spec = Gem::Specification.load("#{GEM_NAME}.gemspec") task :default => [:test] +test_config = lambda do |t| + t.libs << "test" + t.test_files = FileList['test/test*.rb'] - ['test/test_suite.rb'] + t.verbose = true +end + +RubyMemcheck.config(binary_name: SO_NAME) + # Setup compile tasks Rake::ExtensionTask.new do |ext| ext.gem_spec = spec @@ -72,8 +81,8 @@ RDoc::Task.new("rdoc") do |rdoc| end # Test Task -Rake::TestTask.new do |t| - t.libs << "test" - t.test_files = FileList['test/test*.rb'] - ['test/test_suite.rb'] - t.verbose = true -end \ No newline at end of file +Rake::TestTask.new(&test_config) + +namespace :test do + RubyMemcheck::TestTask.new(valgrind: :compile, &test_config) +end diff --git a/ext/libxml/ruby_xml_io.c b/ext/libxml/ruby_xml_io.c index fabf9220..4b2d5d2f 100644 --- a/ext/libxml/ruby_xml_io.c +++ b/ext/libxml/ruby_xml_io.c @@ -19,6 +19,8 @@ int rxml_read_callback(void *context, char *buffer, int len) return 0; size = RSTRING_LEN(string); + if (size > (size_t)len) + size = (size_t)len; memcpy(buffer, StringValuePtr(string), size); return (int)size; diff --git a/libxml-ruby.gemspec b/libxml-ruby.gemspec index bc87b2a8..a196d5e4 100644 --- a/libxml-ruby.gemspec +++ b/libxml-ruby.gemspec @@ -47,5 +47,6 @@ Gem::Specification.new do |spec| spec.add_development_dependency('rake-compiler') spec.add_development_dependency('minitest') spec.add_development_dependency('rdoc') + spec.add_development_dependency('ruby_memcheck') spec.license = 'MIT' end diff --git a/test/test_xpath.rb b/test/test_xpath.rb index 75aec3d6..1daef685 100644 --- a/test/test_xpath.rb +++ b/test/test_xpath.rb @@ -5,11 +5,13 @@ class TestXPath < Minitest::Test def 
setup + GC.stress = true @doc = LibXML::XML::Document.file(File.join(File.dirname(__FILE__), 'model/soap.xml')) end def teardown @doc = nil + GC.stress = false end def test_doc_find @@ -30,15 +32,11 @@ def test_ns end def test_ns_gc - _stress = GC.stress - GC.stress = true - doc = LibXML::XML::Document.string('') node = doc.root + doc = nil # This line segfaults on prior versions of libxml-ruby node.find("namespace::*") - - GC.stress = _stress end def test_ns_array @@ -156,11 +154,11 @@ def test_memory # is free, it iterates over its results which are pointers # to the document's nodes. A segmentation fault then happens. - 1000.times do + 10.times do doc = LibXML::XML::Document.new('1.0') doc.root = LibXML::XML::Node.new("header") - 1000.times do + 10.times do doc.root << LibXML::XML::Node.new("footer") end From 0931b78ca16a71cb78abc0ce6e5a59c0568ead51 Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 22:32:35 -0700 Subject: [PATCH 03/16] Fix Schema @version reading name instead of version. --- ext/libxml/ruby_xml_schema.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/libxml/ruby_xml_schema.c b/ext/libxml/ruby_xml_schema.c index 5c43ea51..4430897c 100644 --- a/ext/libxml/ruby_xml_schema.c +++ b/ext/libxml/ruby_xml_schema.c @@ -137,7 +137,7 @@ VALUE rxml_wrap_schema(xmlSchemaPtr xschema) rb_iv_set(result, "@target_namespace", QNIL_OR_STRING(xschema->targetNamespace)); rb_iv_set(result, "@name", QNIL_OR_STRING(xschema->name)); rb_iv_set(result, "@id", QNIL_OR_STRING(xschema->id)); - rb_iv_set(result, "@version", QNIL_OR_STRING(xschema->name)); + rb_iv_set(result, "@version", QNIL_OR_STRING(xschema->version)); return result; } From 77557f021e4e0405ecc42eb5f56991a805278cfc Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 22:37:01 -0700 Subject: [PATCH 04/16] Remove no-op free functions from schema wrapper types. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These objects are owned by the parent xmlSchemaPtr and freed when the schema is freed. The free functions were nulling a local variable then calling xmlFree(NULL) — a no-op since their introduction in 2012. --- ext/libxml/ruby_xml_schema_attribute.c | 9 +-------- ext/libxml/ruby_xml_schema_element.c | 9 +-------- ext/libxml/ruby_xml_schema_facet.c | 9 +-------- ext/libxml/ruby_xml_schema_type.c | 9 +-------- 4 files changed, 4 insertions(+), 32 deletions(-) diff --git a/ext/libxml/ruby_xml_schema_attribute.c b/ext/libxml/ruby_xml_schema_attribute.c index 298d5cb9..db6134ce 100644 --- a/ext/libxml/ruby_xml_schema_attribute.c +++ b/ext/libxml/ruby_xml_schema_attribute.c @@ -50,16 +50,9 @@ struct _xmlSchemaAttributeUseProhib { VALUE cXMLSchemaAttribute; -static void rxml_schema_attribute_free(void *data) -{ - xmlSchemaAttributeUsePtr attr = (xmlSchemaAttributeUsePtr)data; - attr = NULL; - xmlFree(attr); -} - static const rb_data_type_t rxml_schema_attribute_type = { "XML::Schema::Attribute", - {NULL, rxml_schema_attribute_free, NULL}, + {NULL, NULL, NULL}, NULL, NULL, 0 }; diff --git a/ext/libxml/ruby_xml_schema_element.c b/ext/libxml/ruby_xml_schema_element.c index aa9022d4..bc7806d3 100644 --- a/ext/libxml/ruby_xml_schema_element.c +++ b/ext/libxml/ruby_xml_schema_element.c @@ -4,16 +4,9 @@ VALUE cXMLSchemaElement; -static void rxml_schema_element_free(void *data) -{ - xmlSchemaElementPtr xschema_element = (xmlSchemaElementPtr)data; - xschema_element = NULL; - xmlFree(xschema_element); -} - static const rb_data_type_t rxml_schema_element_type = { "XML::Schema::Element", - {NULL, rxml_schema_element_free, NULL}, + {NULL, NULL, NULL}, NULL, NULL, 0 }; diff --git a/ext/libxml/ruby_xml_schema_facet.c b/ext/libxml/ruby_xml_schema_facet.c index 9f44fc44..5cb0f3a1 100644 --- a/ext/libxml/ruby_xml_schema_facet.c +++ b/ext/libxml/ruby_xml_schema_facet.c @@ -7,16 +7,9 @@ VALUE 
cXMLSchemaFacet; -static void rxml_schema_facet_free(void *data) -{ - xmlSchemaFacetPtr facet = (xmlSchemaFacetPtr)data; - facet = NULL; - xmlFree(facet); -} - static const rb_data_type_t rxml_schema_facet_type = { "XML::Schema::Facet", - {NULL, rxml_schema_facet_free, NULL}, + {NULL, NULL, NULL}, NULL, NULL, 0 }; diff --git a/ext/libxml/ruby_xml_schema_type.c b/ext/libxml/ruby_xml_schema_type.c index 45453d2c..d26ee0e9 100644 --- a/ext/libxml/ruby_xml_schema_type.c +++ b/ext/libxml/ruby_xml_schema_type.c @@ -46,16 +46,9 @@ a group definition, a XML_SCHEMA_EXTRA_QNAMEREF (if a reference), VALUE cXMLSchemaType; -static void rxml_schema_type_free(void *data) -{ - xmlSchemaTypePtr xschema_type = (xmlSchemaTypePtr)data; - xschema_type = NULL; - xmlFree(xschema_type); -} - static const rb_data_type_t rxml_schema_type_type = { "XML::Schema::Type", - {NULL, rxml_schema_type_free, NULL}, + {NULL, NULL, NULL}, NULL, NULL, 0 }; From 86da5a33cf55a2146dc72d99af87991ef65450bb Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 22:38:56 -0700 Subject: [PATCH 05/16] Fix incorrect types in TypedData_Get_Struct for node_type accessors. 
--- ext/libxml/ruby_xml_attr_decl.c | 4 ++-- ext/libxml/ruby_xml_document.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/libxml/ruby_xml_attr_decl.c b/ext/libxml/ruby_xml_attr_decl.c index 8f20395f..9e3eeeef 100644 --- a/ext/libxml/ruby_xml_attr_decl.c +++ b/ext/libxml/ruby_xml_attr_decl.c @@ -88,8 +88,8 @@ static VALUE rxml_attr_decl_next_get(VALUE self) */ static VALUE rxml_attr_decl_node_type(VALUE self) { - xmlAttrPtr xattr; - TypedData_Get_Struct(self, xmlAttr, &rxml_attr_decl_type, xattr); + xmlAttributePtr xattr; + TypedData_Get_Struct(self, xmlAttribute, &rxml_attr_decl_type, xattr); return INT2NUM(xattr->type); } diff --git a/ext/libxml/ruby_xml_document.c b/ext/libxml/ruby_xml_document.c index 9038bc23..2d33a1ee 100644 --- a/ext/libxml/ruby_xml_document.c +++ b/ext/libxml/ruby_xml_document.c @@ -632,9 +632,9 @@ static VALUE rxml_document_next_q(VALUE self) */ static VALUE rxml_document_node_type(VALUE self) { - xmlNodePtr xnode; - TypedData_Get_Struct(self, xmlNode, &rxml_document_data_type, xnode); - return (INT2NUM(xnode->type)); + xmlDocPtr xdoc; + TypedData_Get_Struct(self, xmlDoc, &rxml_document_data_type, xdoc); + return (INT2NUM(xdoc->type)); } /* From 30dc3d342cde349987cc375881a1c156e625470f Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 22:45:08 -0700 Subject: [PATCH 06/16] Revert "Fix incorrect types in TypedData_Get_Struct for node_type accessors." This reverts commit 86da5a33cf55a2146dc72d99af87991ef65450bb. 
--- ext/libxml/ruby_xml_attr_decl.c | 4 ++-- ext/libxml/ruby_xml_document.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/libxml/ruby_xml_attr_decl.c b/ext/libxml/ruby_xml_attr_decl.c index 9e3eeeef..8f20395f 100644 --- a/ext/libxml/ruby_xml_attr_decl.c +++ b/ext/libxml/ruby_xml_attr_decl.c @@ -88,8 +88,8 @@ static VALUE rxml_attr_decl_next_get(VALUE self) */ static VALUE rxml_attr_decl_node_type(VALUE self) { - xmlAttributePtr xattr; - TypedData_Get_Struct(self, xmlAttribute, &rxml_attr_decl_type, xattr); + xmlAttrPtr xattr; + TypedData_Get_Struct(self, xmlAttr, &rxml_attr_decl_type, xattr); return INT2NUM(xattr->type); } diff --git a/ext/libxml/ruby_xml_document.c b/ext/libxml/ruby_xml_document.c index 2d33a1ee..9038bc23 100644 --- a/ext/libxml/ruby_xml_document.c +++ b/ext/libxml/ruby_xml_document.c @@ -632,9 +632,9 @@ static VALUE rxml_document_next_q(VALUE self) */ static VALUE rxml_document_node_type(VALUE self) { - xmlDocPtr xdoc; - TypedData_Get_Struct(self, xmlDoc, &rxml_document_data_type, xdoc); - return (INT2NUM(xdoc->type)); + xmlNodePtr xnode; + TypedData_Get_Struct(self, xmlNode, &rxml_document_data_type, xnode); + return (INT2NUM(xnode->type)); } /* From 9f17f7f77beadd08b63a02a04512784faea6dbfb Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 22:48:12 -0700 Subject: [PATCH 07/16] Fix incorrect types in TypedData_Get_Struct for node_type accessors. attr_decl used xmlAttr instead of xmlAttribute, document used xmlNode instead of xmlDoc. Both worked by accident because libxml2 places the type field at the same offset in all node-like structs. 
--- ext/libxml/ruby_xml_attr_decl.c | 4 ++-- ext/libxml/ruby_xml_document.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/libxml/ruby_xml_attr_decl.c b/ext/libxml/ruby_xml_attr_decl.c index 8f20395f..9e3eeeef 100644 --- a/ext/libxml/ruby_xml_attr_decl.c +++ b/ext/libxml/ruby_xml_attr_decl.c @@ -88,8 +88,8 @@ static VALUE rxml_attr_decl_next_get(VALUE self) */ static VALUE rxml_attr_decl_node_type(VALUE self) { - xmlAttrPtr xattr; - TypedData_Get_Struct(self, xmlAttr, &rxml_attr_decl_type, xattr); + xmlAttributePtr xattr; + TypedData_Get_Struct(self, xmlAttribute, &rxml_attr_decl_type, xattr); return INT2NUM(xattr->type); } diff --git a/ext/libxml/ruby_xml_document.c b/ext/libxml/ruby_xml_document.c index 9038bc23..2d33a1ee 100644 --- a/ext/libxml/ruby_xml_document.c +++ b/ext/libxml/ruby_xml_document.c @@ -632,9 +632,9 @@ static VALUE rxml_document_next_q(VALUE self) */ static VALUE rxml_document_node_type(VALUE self) { - xmlNodePtr xnode; - TypedData_Get_Struct(self, xmlNode, &rxml_document_data_type, xnode); - return (INT2NUM(xnode->type)); + xmlDocPtr xdoc; + TypedData_Get_Struct(self, xmlDoc, &rxml_document_data_type, xdoc); + return (INT2NUM(xdoc->type)); } /* From 263cff6f6d14af370205e2a15654f1f8922c33fa Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 22:49:33 -0700 Subject: [PATCH 08/16] Remove dead Check_Type after NUM2INT in HTML parser context options. 
--- ext/libxml/ruby_xml_html_parser_context.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ext/libxml/ruby_xml_html_parser_context.c b/ext/libxml/ruby_xml_html_parser_context.c index d7d65c5b..28bbed1a 100644 --- a/ext/libxml/ruby_xml_html_parser_context.c +++ b/ext/libxml/ruby_xml_html_parser_context.c @@ -327,7 +327,6 @@ static VALUE rxml_html_parser_context_options_set(VALUE self, VALUE options) { int xml_options = NUM2INT(options); htmlParserCtxtPtr ctxt; - Check_Type(options, T_FIXNUM); TypedData_Get_Struct(self, htmlParserCtxt, &rxml_html_parser_context_type, ctxt); htmlCtxtUseOptions(ctxt, xml_options); From 2104be5393538e4710b96998d981489f89b6ab1c Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 23:02:33 -0700 Subject: [PATCH 09/16] Use rb_ensure in namespaces.each to free nsList on exception. --- ext/libxml/ruby_xml_namespaces.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/ext/libxml/ruby_xml_namespaces.c b/ext/libxml/ruby_xml_namespaces.c index 3999b93f..1316896d 100644 --- a/ext/libxml/ruby_xml_namespaces.c +++ b/ext/libxml/ruby_xml_namespaces.c @@ -116,10 +116,31 @@ static VALUE rxml_namespaces_definitions(VALUE self) * .. 
* end */ +static VALUE rxml_namespaces_each_yield(VALUE data) +{ + xmlNsPtr*nsList = (xmlNsPtr*)data; + xmlNsPtr*xns; + + for (xns = nsList; *xns != NULL; xns++) + { + VALUE ns = rxml_namespace_wrap(*xns); + rb_yield(ns); + } + + return Qnil; +} + +static VALUE rxml_namespaces_free_list(VALUE data) +{ + xmlNsPtr*nsList = (xmlNsPtr*)data; + xmlFree(nsList); + return Qnil; +} + static VALUE rxml_namespaces_each(VALUE self) { xmlNodePtr xnode; - xmlNsPtr *nsList, *xns; + xmlNsPtr*nsList; TypedData_Get_Struct(self, xmlNode, &rxml_namespaces_type, xnode); @@ -128,12 +149,8 @@ static VALUE rxml_namespaces_each(VALUE self) if (nsList == NULL) return (Qnil); - for (xns = nsList; *xns != NULL; xns++) - { - VALUE ns = rxml_namespace_wrap(*xns); - rb_yield(ns); - } - xmlFree(nsList); + rb_ensure(rxml_namespaces_each_yield, (VALUE)nsList, + rxml_namespaces_free_list, (VALUE)nsList); return Qnil; } From 2d7f2d30d259873c7b89fdf8c166b878a84a0a36 Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 23:07:51 -0700 Subject: [PATCH 10/16] Fix write callback to use len parameter instead of strlen for StringIO path. --- ext/libxml/ruby_xml_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/libxml/ruby_xml_io.c b/ext/libxml/ruby_xml_io.c index 4b2d5d2f..727453e7 100644 --- a/ext/libxml/ruby_xml_io.c +++ b/ext/libxml/ruby_xml_io.c @@ -32,7 +32,7 @@ int rxml_write_callback(VALUE io, const char *buffer, int len) { // Could be StringIO VALUE written, string; - string = rb_external_str_new_with_enc(buffer, (long)strlen(buffer), rb_enc_get(io)); + string = rb_external_str_new_with_enc(buffer, (long)len, rb_enc_get(io)); written = rb_funcall(io, WRITE_METHOD, 1, string); return NUM2INT(written); } From f649c1e7e23827233ef43e7f649bb4f1c47f01b3 Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Mon, 6 Apr 2026 23:20:14 -0700 Subject: [PATCH 11/16] Revert accidentally committed local development changes. 
--- .gitignore | 46 ++++++++++++++++++++++----------------------- Gemfile | 1 - LICENSE | 40 +++++++++++++++++++-------------------- Rakefile | 19 +++++-------------- libxml-ruby.gemspec | 1 - test/test_xpath.rb | 12 +++++++----- 6 files changed, 55 insertions(+), 64 deletions(-) diff --git a/.gitignore b/.gitignore index b8d139b2..27721079 100644 --- a/.gitignore +++ b/.gitignore @@ -1,23 +1,23 @@ -pkg -nbproject -website/_site -*.swp -*.swo - -/ext/vc/libxml_ruby.sdf -/ext/vc/libxml_ruby_19/Debug -/ext/vc/libxml_ruby_18/Debug -/doc -/tmp -/.idea -/ext/vc/ipch -/ext/vc/libxml_ruby.opensdf -.config -InstalledFiles -ext/libxml/libxml_ruby.bundle -.DS_Store -/ext/vc/*.suo -Gemfile.lock -lib/*/libxml_ruby.so -/ext/vc/libxml_ruby/x64 -/ext/vc/libxml_ruby/libxml_ruby.vcxproj.user +pkg +nbproject +website/_site +*.swp +*.swo + +/ext/vc/libxml_ruby.sdf +/ext/vc/libxml_ruby_19/Debug +/ext/vc/libxml_ruby_18/Debug +/doc +/tmp +/.idea +/ext/vc/ipch +/ext/vc/libxml_ruby.opensdf +.config +InstalledFiles +ext/libxml/libxml_ruby.bundle +.DS_Store +/ext/vc/*.suo +Gemfile.lock +lib/*/libxml_ruby.so +/ext/vc/libxml_ruby/x64 +/ext/vc/libxml_ruby/libxml_ruby.vcxproj.user diff --git a/Gemfile b/Gemfile index 6e35ac6c..b42b1ad4 100644 --- a/Gemfile +++ b/Gemfile @@ -1,4 +1,3 @@ source "https://www.rubygems.org" gemspec -gem "ruby_memcheck", path: "../ruby_memcheck" diff --git a/LICENSE b/LICENSE index 373b74ec..71fe8d11 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,21 @@ - Copyright (c) 2008-2013 Charlie Savage and contributors - Copyright (c) 2002-2007 Sean Chittenden and contributors - Copyright (c) 2001 Wai-Sun "Squidster" Chia - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to 
whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + Copyright (c) 2008-2013 Charlie Savage and contributors + Copyright (c) 2002-2007 Sean Chittenden and contributors + Copyright (c) 2001 Wai-Sun "Squidster" Chia + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file diff --git a/Rakefile b/Rakefile index 3f3ba06c..d917ce5f 100644 --- a/Rakefile +++ b/Rakefile @@ -3,7 +3,6 @@ require "rubygems" require "rake/extensiontask" require "rake/testtask" -require "ruby_memcheck" require "rubygems/package_task" require "rdoc/task" require "yaml" @@ -16,14 +15,6 @@ spec = Gem::Specification.load("#{GEM_NAME}.gemspec") task :default => [:test] -test_config = lambda do |t| - t.libs << "test" - t.test_files = FileList['test/test*.rb'] - ['test/test_suite.rb'] - t.verbose = true -end - -RubyMemcheck.config(binary_name: SO_NAME) - # Setup compile tasks Rake::ExtensionTask.new do |ext| ext.gem_spec = spec @@ -81,8 +72,8 @@ RDoc::Task.new("rdoc") do |rdoc| end # Test Task -Rake::TestTask.new(&test_config) - -namespace :test do - RubyMemcheck::TestTask.new(valgrind: :compile, &test_config) -end +Rake::TestTask.new do |t| + t.libs << "test" + t.test_files = FileList['test/test*.rb'] - ['test/test_suite.rb'] + t.verbose = true +end \ No newline at end of file diff --git a/libxml-ruby.gemspec b/libxml-ruby.gemspec index a196d5e4..bc87b2a8 100644 --- a/libxml-ruby.gemspec +++ b/libxml-ruby.gemspec @@ -47,6 +47,5 @@ Gem::Specification.new do |spec| spec.add_development_dependency('rake-compiler') spec.add_development_dependency('minitest') spec.add_development_dependency('rdoc') - spec.add_development_dependency('ruby_memcheck') spec.license = 'MIT' end diff --git a/test/test_xpath.rb b/test/test_xpath.rb index 1daef685..75aec3d6 100644 --- a/test/test_xpath.rb +++ b/test/test_xpath.rb @@ -5,13 +5,11 @@ class TestXPath < Minitest::Test def setup - GC.stress = true @doc = LibXML::XML::Document.file(File.join(File.dirname(__FILE__), 'model/soap.xml')) end def teardown @doc = nil - GC.stress = false end def test_doc_find @@ -32,11 +30,15 @@ def test_ns end def test_ns_gc + _stress = GC.stress + GC.stress = true + doc = LibXML::XML::Document.string('') node = doc.root - doc = nil # This line segfaults on prior versions of 
libxml-ruby node.find("namespace::*") + + GC.stress = _stress end def test_ns_array @@ -154,11 +156,11 @@ def test_memory # is free, it iterates over its results which are pointers # to the document's nodes. A segmentation fault then happens. - 10.times do + 1000.times do doc = LibXML::XML::Document.new('1.0') doc.root = LibXML::XML::Node.new("header") - 10.times do + 1000.times do doc.root << LibXML::XML::Node.new("footer") end From 682d7b20d31fd43cdad8bbd418b9413cabd85225 Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Tue, 7 Apr 2026 00:45:37 -0700 Subject: [PATCH 12/16] Remove dead Float::INFINITY definition for Ruby 1.8.7 compatibility. --- ext/libxml/ruby_xml_schema_type.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ext/libxml/ruby_xml_schema_type.c b/ext/libxml/ruby_xml_schema_type.c index d26ee0e9..a70a067a 100644 --- a/ext/libxml/ruby_xml_schema_type.c +++ b/ext/libxml/ruby_xml_schema_type.c @@ -229,13 +229,6 @@ static VALUE rxml_schema_type_attributes(VALUE self) void rxml_init_schema_type(void) { - /* Add in infinity support for ruby 1.8.7 */ - #if !defined(RUBY_VM) && defined(INFINITY) - ID infinityId = rb_intern("INFINITY"); - if (rb_const_defined(rb_cFloat, infinityId) == Qfalse) - rb_define_const(rb_cFloat, "INFINITY", rb_float_new(INFINITY)); - #endif - cXMLSchemaType = rb_define_class_under(cXMLSchema, "Type", rb_cObject); rb_undef_alloc_func(cXMLSchemaType); From 459042ec6c7e740699b5c60ea68be1b910854c97 Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Tue, 7 Apr 2026 00:54:30 -0700 Subject: [PATCH 13/16] Switch rdoc from hanna to aliki, update for renamed files, exclude top-level include wrappers. 
--- Rakefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Rakefile b/Rakefile index d917ce5f..a8954a9c 100644 --- a/Rakefile +++ b/Rakefile @@ -54,21 +54,21 @@ desc 'Generate rdoc documentation' RDoc::Task.new("rdoc") do |rdoc| rdoc.rdoc_dir = 'rdoc' rdoc.title = 'LibXML' - rdoc.generator = 'hanna' + rdoc.generator = 'aliki' # Show source inline with line numbers rdoc.options << '--line-numbers' rdoc.options << '--charset=utf-8' # Make the readme file the start page for the generated html - rdoc.main = 'README.rdoc' - rdoc.rdoc_files.include('doc/*.rdoc', - 'ext/**/libxml.c', + rdoc.main = 'README.md' + rdoc.rdoc_files.include('ext/**/libxml.c', 'ext/**/ruby_xml.c', 'ext/**/*.c', 'lib/**/*.rb', - 'README.rdoc', - 'HISTORY', + 'README.md', + 'CHANGELOG.md', 'LICENSE') + rdoc.rdoc_files.exclude('lib/xml.rb', 'lib/xml/libxml.rb') end # Test Task From 6f8e94251892a7ef76c3ed213ad0ae76702e0a6a Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Tue, 7 Apr 2026 01:04:03 -0700 Subject: [PATCH 14/16] Convert README to markdown, remove benchmarks, update documentation links. --- README.md | 372 ++++++++++++++++++++++++------------------------------ 1 file changed, 164 insertions(+), 208 deletions(-) diff --git a/README.md b/README.md index c995b066..27adb6f5 100644 --- a/README.md +++ b/README.md @@ -1,208 +1,164 @@ -= LibXML Ruby - -== Overview -The libxml gem provides Ruby language bindings for GNOME's Libxml2 -XML toolkit. It is free software, released under the MIT License. - -We think libxml-ruby is the best XML library for Ruby because: - -* Speed - Its much faster than REXML and Hpricot -* Features - It provides an amazing number of featues -* Conformance - It passes all 1800+ tests from the OASIS XML Tests Suite - -== Requirements -libxml-ruby requires Ruby 3.0.0 or higher. It depends on libxml2 to -function properly. 
libxml2, in turn, depends on: - -* libm (math routines: very standard) -* libz (zlib) -* libiconv - -If you are running Linux or Unix you'll need a C compiler so the -extension can be compiled when it is installed. If you are running -Windows, then install the x64-mingw-ucr gem or build it yourself using (Ruby -for Windows)[https://rubyinstaller.org/] or directly with msys2[https://msys2.github.io/] -and ucrt64. - -== Installation -The easiest way to install libxml-ruby is via RubyGems. To install: - -gem install libxml-ruby - -If the extension compile process cannot find libxml2, you may need to indicate -the location of the libxml2 configuration utility as it is used to find the -required header and include files. (If you need to indicate a location for the -libxml2 library or header files different than reported by xml2-config, -see the additional configuration options.) - -This may be done with RubyGems: - -gem install libxml-ruby -- --with-xml2-dir=/path/to/xml2-config - -Or bundler: - -bundle config build.libxml-ruby --with-xml2-config=/path/to/xml2-config - -bundle install libxml-ruby - -If you are running Windows, then install the libxml-ruby-x64-mingw32 gem. -The gem includes prebuilt extensions for Ruby 3.2 and 3.3. - -The gem also includes a Microsoft VC++ solution and XCode project - these -are very useful for debugging. - -libxml-ruby's source codes lives on GitHub[https://github.com/xml4r/libxml-ruby]. - -== Getting Started -Using libxml is easy. First decide what parser you want to use: - -* Generally you'll want to use the LibXML::XML::Parser which provides a tree based API. -* For larger documents that don't fit into memory, or if you prefer an input based API, use the LibXML::XML::Reader. -* To parse HTML files use LibXML::XML::HTMLParser. -* If you are masochistic, then use the LibXML::XML::SaxParser, which provides a callback API. - -Once you have chosen a parser, choose a datasource. Libxml can parse files, strings, URIs -and IO streams. 
For each data source you can specify an LibXML::XML::Encoding, a base uri and -various parser options. For more information, refer the LibXML::XML::Parser.document, -LibXML::XML::Parser.file, LibXML::XML::Parser.io or LibXML:::XML::Parser.string methods (the -same methods are defined on all four parser classes). - -== Advanced Functionality -Beyond the basics of parsing and processing XML and HTML documents, -libxml provides a wealth of additional functionality. - -Most commonly, you'll want to use its LibXML::XML::XPath support, which makes -it easy to find data inside an XML document. Although not as popular, -LibXML::XML::XPointer provides another API for finding data inside an XML document. - -Often times you'll need to validate data before processing it. For example, -if you accept user generated content submitted over the Web, you'll -want to verify that it does not contain malicious code such as embedded scripts. -This can be done using libxml's powerful set of validators: - -* DTDs (LibXML::XML::Dtd) -* Relax Schemas (LibXML::XML::RelaxNG) -* XML Schema (LibXML::XML::Schema) - -Finally, if you'd like to use XSL Transformations to process data, then install -the {libxslt gem}[https://github.com/xml4r/libxslt-rubygem]. - -== Usage -For information about using libxml-ruby please refer to its -documentation[https://xml4r.github.io/libxml-ruby]. Some tutorials are also -available[https://github.com/xml4r/libxml-ruby/wiki]. - -All libxml classes are in the LibXML::XML module. The easiest -way to use libxml is to require 'xml'. 
This will mixin -the LibXML module into the global namespace, allowing you to -write code like this: - - require 'xml' - document = XML::Document.new - -However, when creating an application or library you plan to -redistribute, it is best to not add the LibXML module to the global -namespace, in which case you can either write your code like this: - - require 'libxml' - document = LibXML::XML::Document.new - -Or you can utilize a namespace for your own work and include LibXML into it. -For example: - - require 'libxml' - - module MyApplication - include LibXML - - class MyClass - def some_method - document = XML::Document.new - end - end - end - -For simplicity's sake, the documentation uses the xml module in its examples. - -== Tests - -To run tests you first need to build the shared libary: - - rake compile - -Once you have build the shared libary, you can then run tests using rake: - - rake test - -+Build status: {rdoc-image:https://github.com/xml4r/libxml-ruby/actions/workflows/mri.yml/badge.svg}[https://github.com/xml4r/libxml-ruby/actions/workflows/mri.yml] - -== Performance - -In addition to being feature rich and conformation, the main reason -people use libxml-ruby is for performance. Here are the results -of a couple simple benchmarks recently blogged about on the -Web (you can find them in the benchmark directory of the -libxml distribution). - -From http://depixelate.com/2008/4/23/ruby-xml-parsing-benchmarks - - user system total real - libxml 0.032000 0.000000 0.032000 ( 0.031000) - Hpricot 0.640000 0.031000 0.671000 ( 0.890000) - REXML 1.813000 0.047000 1.860000 ( 2.031000) - -From https://svn.concord.org/svn/projects/trunk/common/ruby/xml_benchmarks/ - - user system total real - libxml 0.641000 0.031000 0.672000 ( 0.672000) - hpricot 5.359000 0.062000 5.421000 ( 5.516000) - rexml 22.859000 0.047000 22.906000 ( 23.203000) - - -== Documentation -Documentation is available via rdoc, and is installed automatically with the -gem. 
- -libxml-ruby's {online -documentation}[https://xml4r.github.io/libxml-ruby/rdoc/index.html] is generated -using Hanna, which is a development gem dependency. - -Note that older versions of Rdoc, which ship with Ruby 1.8.x, will report -a number of errors. To avoid them, install Rdoc 2.1 or higher. Once you have -installed the gem, you'll have to disable the version of Rdoc that Ruby 1.8.x -includes. An easy way to do that is rename the directory -ruby/lib/ruby/1.8/rdoc to -ruby/lib/ruby/1.8/rdoc_old. - -== Support -If you have any questions about using libxml-ruby, please report an issue -on GitHub[https://github.com/xml4r/libxml-ruby/issues]. - -== Memory Management -libxml-ruby automatically manages memory associated with the -underlying libxml2 library. The bindings create a one-to-one mapping between -Ruby objects and libxml documents and libxml parent nodes (ie, nodes that do not -have a parent and do not belong to a document). In these cases, -the bindings manage the memory. They do this by installing a free -function and storing a back pointer to the Ruby object from the xmlnode -using the _private member on libxml structures. When the Ruby object -goes out of scope, the underlying libxml structure is freed. Libxml -itself then frees all child nodes (recursively). - -For all other nodes (the vast majority), the bindings create temporary -Ruby objects that get freed once they go out of scope. Thus there can be -more than one Ruby object pointing to the same xml node. To mostly hide -this from a programmer on the Ruby side, the #eql? and #== methods are -overriden to check if two Ruby objects wrap the same xmlnode. If they do, -then the methods return true. During the mark phase, each of these temporary -objects marks its owning document, thereby keeping the Ruby document object -alive and thus the xmldoc tree. - -In the sweep phase of the garbage collector, or when a program ends, -there is no order to how Ruby objects are freed. 
In fact, the Ruby document -object is almost always freed before any Ruby objects that wrap child nodes. -However, this is ok because those Ruby objects do not have a free function -and are no longer in scope (since if they were the document would not be freed). - -== License -See LICENSE for license information. +# LibXML Ruby + +## Overview +The libxml gem provides Ruby language bindings for GNOME's Libxml2 +XML toolkit. It is free software, released under the MIT License. + +We think libxml-ruby is the best XML library for Ruby because: + +* Speed - It's much faster than REXML +* Features - It provides an amazing number of features +* Conformance - It passes all 1800+ tests from the OASIS XML Tests Suite + +## Requirements +libxml-ruby requires Ruby 3.2 or higher. It depends on libxml2 to +function properly. libxml2, in turn, depends on: + +* libm (math routines: very standard) +* libz (zlib) +* libiconv + +If you are running Linux or Unix you'll need a C compiler so the +extension can be compiled when it is installed. If you are running +Windows, then install the x64-mingw-ucr gem or build it yourself using +[Ruby for Windows](https://rubyinstaller.org/) or directly with +[msys2](https://msys2.github.io/) and ucrt64. + +## Installation +The easiest way to install libxml-ruby is via RubyGems. To install: + +``` +gem install libxml-ruby +``` + +If the extension compile process cannot find libxml2, you may need to indicate +the location of the libxml2 configuration utility as it is used to find the +required header and include files. (If you need to indicate a location for the +libxml2 library or header files different than reported by `xml2-config`, +see the additional configuration options.) 
+ +This may be done with RubyGems: + +``` +gem install libxml-ruby -- --with-xml2-config=/path/to/xml2-config +``` + +Or bundler: + +``` +bundle config build.libxml-ruby --with-xml2-config=/path/to/xml2-config +bundle install +``` + +If you are running Windows, then install the libxml-ruby-x64-mingw32 gem. +The gem includes prebuilt extensions for Ruby 3.2 and 3.3. + +The gem also includes a Microsoft VC++ solution and XCode project - these +are very useful for debugging. + +libxml-ruby's source code lives on [GitHub](https://github.com/xml4r/libxml-ruby). + +## Getting Started +Using libxml is easy. First decide what parser you want to use: + +* Generally you'll want to use the `LibXML::XML::Parser` which provides a tree based API. +* For larger documents that don't fit into memory, or if you prefer an input based API, use the `LibXML::XML::Reader`. +* To parse HTML files use `LibXML::XML::HTMLParser`. +* If you are masochistic, then use the `LibXML::XML::SaxParser`, which provides a callback API. + +Once you have chosen a parser, choose a datasource. Libxml can parse files, strings, URIs +and IO streams. For each data source you can specify a `LibXML::XML::Encoding`, a base uri and +various parser options. For more information, refer to the `LibXML::XML::Parser.document`, +`LibXML::XML::Parser.file`, `LibXML::XML::Parser.io` or `LibXML::XML::Parser.string` methods (the +same methods are defined on all four parser classes). + +## Advanced Functionality +Beyond the basics of parsing and processing XML and HTML documents, +libxml provides a wealth of additional functionality. + +Most commonly, you'll want to use its `LibXML::XML::XPath` support, which makes +it easy to find data inside an XML document. Although not as popular, +`LibXML::XML::XPointer` provides another API for finding data inside an XML document. + +Often times you'll need to validate data before processing it.
For example, +if you accept user generated content submitted over the Web, you'll +want to verify that it does not contain malicious code such as embedded scripts. +This can be done using libxml's powerful set of validators: + +* DTDs (`LibXML::XML::Dtd`) +* Relax Schemas (`LibXML::XML::RelaxNG`) +* XML Schema (`LibXML::XML::Schema`) + +Finally, if you'd like to use XSL Transformations to process data, then install +the [libxslt gem](https://github.com/xml4r/libxslt-ruby). + +## Usage +For information about using libxml-ruby please refer to its +[documentation](https://xml4r.github.io/libxml-ruby/). + +All libxml classes are in the `LibXML::XML` module. The easiest +way to use libxml is to `require 'xml'`. This will mixin +the LibXML module into the global namespace, allowing you to +write code like this: + +```ruby +require 'xml' +document = XML::Document.new +``` + +However, when creating an application or library you plan to +redistribute, it is best to not add the LibXML module to the global +namespace, in which case you can either write your code like this: + +```ruby +require 'libxml' +document = LibXML::XML::Document.new +``` + +Or you can utilize a namespace for your own work and include LibXML into it. +For example: + +```ruby +require 'libxml' + +module MyApplication + include LibXML + + class MyClass + def some_method + document = XML::Document.new + end + end +end +``` + +For simplicity's sake, the documentation uses the xml module in its examples. + +## Tests + +To run tests you first need to build the shared library: + +``` +rake compile +``` + +Once you have built the shared library, you can then run tests using rake: + +``` +rake test +``` + +[![Build Status](https://github.com/xml4r/libxml-ruby/actions/workflows/mri.yml/badge.svg)](https://github.com/xml4r/libxml-ruby/actions/workflows/mri.yml) + +## Documentation +Documentation is available at [xml4r.github.io/libxml-ruby](https://xml4r.github.io/libxml-ruby/). 
+ +API reference documentation is generated via rdoc and is available at +[xml4r.github.io/libxml-ruby/reference](https://xml4r.github.io/libxml-ruby/reference/). + +## Support +If you have any questions about using libxml-ruby, please report an issue +on [GitHub](https://github.com/xml4r/libxml-ruby/issues). + +## License +See [LICENSE](LICENSE) for license information. From c042ec2a1fbc89ca9f6a6d0e40cf172645b18c91 Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Tue, 7 Apr 2026 01:04:26 -0700 Subject: [PATCH 15/16] Add site and rdoc output directories to .gitignore. --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 27721079..44baa692 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ website/_site /ext/vc/libxml_ruby_19/Debug /ext/vc/libxml_ruby_18/Debug /doc +/site +/rdoc /tmp /.idea /ext/vc/ipch From 02e168fca0db74c4c8aab63d699a827b6d878d02 Mon Sep 17 00:00:00 2001 From: Charlie Savage Date: Tue, 7 Apr 2026 01:05:03 -0700 Subject: [PATCH 16/16] Add documentation site with zensical and rdoc API reference. 
--- .github/workflows/docs.yml | 66 ++++++++++ docs/architecture/memory.md | 88 +++++++++++++ docs/architecture/registry.md | 75 +++++++++++ docs/getting_started.md | 88 +++++++++++++ docs/index.md | 35 ++++++ docs/installation.md | 60 +++++++++ docs/parsing/dom.md | 104 +++++++++++++++ docs/parsing/html.md | 97 ++++++++++++++ docs/parsing/overview.md | 81 ++++++++++++ docs/parsing/reader.md | 166 ++++++++++++++++++++++++ docs/parsing/sax.md | 152 ++++++++++++++++++++++ docs/validation/dtd.md | 33 +++++ docs/validation/relaxng.md | 28 +++++ docs/validation/schema.md | 40 ++++++ docs/xml/attributes.md | 47 +++++++ docs/xml/documents.md | 120 ++++++++++++++++++ docs/xml/encoding.md | 42 +++++++ docs/xml/errors.md | 55 ++++++++ docs/xml/namespaces.md | 47 +++++++ docs/xml/nodes.md | 229 ++++++++++++++++++++++++++++++++++ docs/xml/writer.md | 81 ++++++++++++ docs/xpath/context.md | 73 +++++++++++ docs/xpath/namespaces.md | 94 ++++++++++++++ docs/xpath/overview.md | 156 +++++++++++++++++++++++ zensical.toml | 97 ++++++++++++++ 25 files changed, 2154 insertions(+) create mode 100644 .github/workflows/docs.yml create mode 100644 docs/architecture/memory.md create mode 100644 docs/architecture/registry.md create mode 100644 docs/getting_started.md create mode 100644 docs/index.md create mode 100644 docs/installation.md create mode 100644 docs/parsing/dom.md create mode 100644 docs/parsing/html.md create mode 100644 docs/parsing/overview.md create mode 100644 docs/parsing/reader.md create mode 100644 docs/parsing/sax.md create mode 100644 docs/validation/dtd.md create mode 100644 docs/validation/relaxng.md create mode 100644 docs/validation/schema.md create mode 100644 docs/xml/attributes.md create mode 100644 docs/xml/documents.md create mode 100644 docs/xml/encoding.md create mode 100644 docs/xml/errors.md create mode 100644 docs/xml/namespaces.md create mode 100644 docs/xml/nodes.md create mode 100644 docs/xml/writer.md create mode 100644 docs/xpath/context.md create 
mode 100644 docs/xpath/namespaces.md create mode 100644 docs/xpath/overview.md create mode 100644 zensical.toml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..a84cc9c7 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,66 @@ +name: Deploy Docs + +on: + push: + branches: [master] + paths: + - 'docs/**' + - 'lib/**' + - 'ext/**' + - 'zensical.toml' + - '.github/workflows/docs.yml' + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: false + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: '4.0' + bundler-cache: false + + - name: Install zensical + run: pip install zensical + + - name: Copy changelog to docs + run: cp CHANGELOG.md docs/changelog.md + + - name: Build guide docs + run: zensical build --clean + + - name: Build API reference + run: rdoc --format aliki --output site/reference --title 'LibXML Ruby API' --line-numbers --charset=utf-8 --exclude lib/xml.rb --exclude lib/xml/libxml.rb --main README.md ext/**/libxml.c ext/**/ruby_xml.c ext/**/*.c lib/**/*.rb README.md + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: site + + deploy: + needs: build + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md new file mode 100644 index 00000000..d6d30182 --- /dev/null +++ b/docs/architecture/memory.md @@ -0,0 +1,88 @@ +# Memory Management + +libxml-ruby automatically manages memory for the underlying libxml2 C library. 
This page explains the ownership model and how the bindings keep Ruby objects and libxml2 C structures in sync. + +## Ownership Model + +libxml2 has a simple ownership rule: an `xmlDocPtr` owns the tree attached to it, and `xmlFreeDoc` frees the document and the entire attached tree. When code unlinks a node with `xmlUnlinkNode`, that detached subtree is no longer document-owned and must either be reattached or freed with `xmlFreeNode`. + +libxml-ruby sits on top of that model. In the normal case, the document is the owner. Ruby node and attr objects do not own the libxml node or attr they point at. They are references into libxml-owned memory, and their mark functions keep the owning Ruby document alive while Ruby still has live references into the tree. + +In the diagram below: + +- solid lines mean `owns` +- blue dashed lines mean `references` a libxml C object +- red dashed lines mean `mark`, which is a Ruby-to-Ruby GC reference + +```mermaid +flowchart TB + DocWrap["Ruby XML::Document"] + XDoc["xmlDocPtr"] + NodeWrap["Ruby XML::Node"] + XNode["xmlNodePtr"] + AttrWrap["Ruby XML::Attr"] + XAttr["xmlAttrPtr"] + + DocWrap -->|owns| XDoc + XDoc -->|owns| XNode + XNode -->|owns| XAttr + + NodeWrap -.references.-> XNode + AttrWrap -.references.-> XAttr + NodeWrap -.mark.-> DocWrap + AttrWrap -.mark.-> DocWrap + DocWrap ~~~ NodeWrap + DocWrap ~~~ AttrWrap + NodeWrap ~~~ AttrWrap + + classDef ruby fill:#f4a0a0,stroke:#8b1f1b,stroke-width:2px; + classDef xml fill:#e8f1ff,stroke:#5b84c4,stroke-width:2px; + class DocWrap,NodeWrap,AttrWrap ruby; + class XDoc,XNode,XAttr xml; + linkStyle 3,4 stroke:#5b84c4,stroke-width:2px,stroke-dasharray: 6 4; + linkStyle 5,6 stroke:#cc342d,stroke-width:2px,stroke-dasharray: 6 4; +``` + +The solid ownership chain is the important part. `XML::Document` owns the `xmlDocPtr`. The `xmlDocPtr` owns the tree, and the `xmlNodePtr` owns its attrs. The dashed lines are references, not ownership. 
The blue dashed edges mean Ruby objects reference libxml objects. The red dashed `mark` edges mean a live Ruby node or attr keeps the Ruby document alive during GC so the underlying tree is not freed while Ruby still references it. + +## Detached Root Nodes + +[Detached nodes](../xml/nodes.md#detached-nodes) are the one exception to the document-owns-everything model. A newly created node is Ruby-owned until it is inserted into a document tree. Removing a node transfers ownership back to Ruby. + +Internally, this is managed by `rxml_node_manage` (Ruby takes ownership), `rxml_node_unmanage` (libxml takes ownership), and `rxml_node_free` (frees a detached node on GC). + +## Object Identity + +Because temporary wrappers are created on demand, accessing the same node twice may return different Ruby objects: + +```ruby +child1 = node.children[0] +child2 = node.children[0] + +child1 == child2 # => true (same underlying node) +child1.equal?(child2) # => false (different Ruby objects) +``` + +Use `==` or `eql?` to compare nodes, not `equal?`. + +Documents and detached root nodes do maintain identity through the [registry](registry.md) — retrieving the same document or detached root always returns the same Ruby object. + +## Preventing Premature Collection + +Keep a reference to the document (or a managed root node) as long as you use any of its nodes: + +```ruby +# Safe - doc stays in scope +doc = XML::Parser.file('data.xml').parse +nodes = doc.find('//item') +nodes.each { |n| process(n) } + +# Risky - doc may be collected +nodes = XML::Parser.file('data.xml').parse.find('//item') +GC.start # doc could be freed here +nodes.first.name # potential crash +``` + +## GC Sweep Order + +During garbage collection (or at program exit), Ruby does not guarantee the order in which objects are freed. The document object is almost always freed before any child node wrappers. This is safe because child node wrappers are non-owning — they have no free function. 
The document's free function calls `xmlFreeDoc`, which recursively frees the entire tree. The child wrappers simply become stale and are collected without action. diff --git a/docs/architecture/registry.md b/docs/architecture/registry.md new file mode 100644 index 00000000..3a42f371 --- /dev/null +++ b/docs/architecture/registry.md @@ -0,0 +1,75 @@ +# Pointer Registry + +The bindings need to map libxml2 C pointers back to their Ruby wrapper objects. This is used for two purposes: + +1. **Object identity** - returning the same Ruby object when the same C pointer is encountered again (documents and detached root nodes) +2. **GC reachability** - mark functions look up the owning Ruby document to keep it alive while Ruby references exist into the tree + +## Design + +The registry is a pointer-keyed `st_table` in `ruby_xml_registry.c` with three operations: + +```c +void rxml_registry_register(void *ptr, VALUE obj); +void rxml_registry_unregister(void *ptr); +VALUE rxml_registry_lookup(void *ptr); /* Qnil on miss */ +``` + +The registry is **not** a GC root. It does not keep objects alive. Objects stay alive through the normal mark chains — mark functions look up the registry instead of holding direct references. + +## What Gets Registered + +Only objects that own their underlying C structure are registered: + +| C pointer | Ruby wrapper | Registered when | +|-----------|-------------|-----------------| +| `xmlDocPtr` | `XML::Document` | Document is created or parsed | +| detached root `xmlNodePtr` | `XML::Node` | Node is created or detached via `remove!` | + +Document-owned child nodes are **not** registered. They are lightweight, non-owning wrappers that get fresh Ruby objects each time they are accessed. 
+ +## How Mark Functions Use It + +When Ruby's GC runs the mark phase, node and attr mark functions look up the owning document through the registry: + +```mermaid +flowchart TD + Registry["internal registry"] + DocWrap["Ruby XML::Document"] + XDoc["xmlDocPtr"] + DetachedWrap["Detached Ruby XML::Node"] + DetachedNode["detached root xmlNodePtr"] + ChildWrap["Ruby XML::Node"] + ChildNode["document-owned xmlNodePtr"] + + DocWrap -->|owns| XDoc + XDoc -->|owns| ChildNode + DetachedWrap -->|owns| DetachedNode + + ChildWrap -.references.-> ChildNode + ChildWrap -.mark.-> DocWrap + + XDoc -.references.-> Registry + DetachedNode -.references.-> Registry + Registry -.references.-> DocWrap + Registry -.references.-> DetachedWrap + + classDef ruby fill:#f4a0a0,stroke:#8b1f1b,stroke-width:2px; + classDef xml fill:#e8f1ff,stroke:#5b84c4,stroke-width:2px; + classDef registry fill:#f5ebcf,stroke:#b89632,stroke-width:2px; + class DocWrap,DetachedWrap,ChildWrap ruby; + class XDoc,DetachedNode,ChildNode xml; + class Registry registry; + linkStyle 3,5,6 stroke:#5b84c4,stroke-width:2px,stroke-dasharray: 6 4; + linkStyle 4,7,8 stroke:#cc342d,stroke-width:2px,stroke-dasharray: 6 4; +``` + +For an attached node, the mark function reads `xnode->doc` (maintained by libxml2), looks up the document in the registry, and marks the Ruby document object. For a detached subtree, it walks to the root via parent pointers, looks up the root in the registry, and marks it. 
+ +## Lifecycle + +Registered pointers must be unregistered before the underlying C structure is freed: + +- `rxml_document_free` unregisters the `xmlDocPtr` before calling `xmlFreeDoc` +- `rxml_node_free` unregisters the detached root before calling `xmlFreeNode` +- `rxml_node_unmanage` unregisters when a detached node is attached to a document (libxml takes ownership) diff --git a/docs/getting_started.md b/docs/getting_started.md new file mode 100644 index 00000000..9fbe11b0 --- /dev/null +++ b/docs/getting_started.md @@ -0,0 +1,88 @@ +# Getting Started + +## Requiring the Library + +There are several ways to load libxml-ruby: + +```ruby +# Recommended - keeps everything under the LibXML namespace +require 'libxml-ruby' +document = LibXML::XML::Document.new +``` + +```ruby +# Convenience - mixes LibXML into the global namespace +require 'xml' +document = XML::Document.new +``` + +```ruby +# In your own namespace +require 'libxml-ruby' + +module MyApp + include LibXML + + class Processor + def parse(file) + XML::Document.file(file) + end + end +end +``` + +## Choosing a Parser + +libxml-ruby provides four parsers, each suited to different use cases: + +| Parser | Best For | +|--------|----------| +| `XML::Parser` | General-purpose DOM parsing. Loads the entire document into a tree. | +| `XML::Reader` | Large documents that don't fit in memory. Pull-based streaming API. | +| `XML::HTMLParser` | Parsing HTML documents (tolerates malformed markup). | +| `XML::SaxParser` | Event-driven parsing with callbacks. 
| + +## Data Sources + +All parsers support multiple data sources: + +```ruby +# From a file +doc = XML::Parser.file('data.xml').parse + +# From a string +doc = XML::Parser.string('<root/>').parse + +# From an IO object +File.open('data.xml') do |io| + doc = XML::Parser.io(io).parse +end +``` + +## A Complete Example + +```ruby +require 'libxml-ruby' + +# Parse +doc = LibXML::XML::Document.file('books.xml') + +# Navigate +root = doc.root +puts root.name + +# Find nodes with XPath +doc.find('//book[@year > 2000]').each do |book| + title = book.find_first('title').content + puts title +end + +# Create new content +new_book = LibXML::XML::Node.new('book') +new_book['year'] = '2024' +new_book << LibXML::XML::Node.new('title', 'New Book') +root << new_book + +# Save +doc.save('books_updated.xml', indent: true) +``` diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..9a99b7c2 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,35 @@ +# libxml-ruby + +Ruby language bindings for the [GNOME Libxml2](http://xmlsoft.org/) XML toolkit. It is free software, released under the MIT License. + +libxml-ruby stands out because of: + +* **Speed** - Much faster than REXML +* **Features** - Full DOM, SAX, Reader, Writer, XPath, validation (DTD, RelaxNG, XML Schema) and more +* **Conformance** - Passes all 1800+ tests from the OASIS XML Tests Suite + +## Quick Example + +```ruby +require 'libxml-ruby' + +# Parse a document +doc = LibXML::XML::Document.file('books.xml') + +# Find nodes with XPath +doc.find('//book').each do |node| + puts node['title'] +end + +# Validate against a schema +schema = LibXML::XML::Schema.new('books.xsd') +doc.validate_schema(schema) +``` + +## Requirements + +libxml-ruby requires Ruby 3.2 or higher and depends on [libxml2](http://xmlsoft.org/). + +## License + +libxml-ruby is released under the [MIT License](https://github.com/xml4r/libxml-ruby/blob/master/LICENSE).
diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 00000000..54578eea --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,60 @@ +# Installation + +## RubyGems + +``` +gem install libxml-ruby +``` + +## Bundler + +Add to your Gemfile: + +```ruby +gem 'libxml-ruby' +``` + +## Specifying libxml2 Location + +If the build process cannot find libxml2, specify the location of `xml2-config`: + +``` +gem install libxml-ruby -- --with-xml2-config=/path/to/xml2-config +``` + +Or with Bundler: + +``` +bundle config build.libxml-ruby --with-xml2-config=/path/to/xml2-config +bundle install +``` + +You can also specify directories directly: + +``` +--with-xml2-dir=/path/to/libxml2 +--with-xml2-lib=/path/to/libxml2/lib +--with-xml2-include=/path/to/libxml2/include +``` + +## Platform Notes + +### Linux / macOS + +A C compiler is required to build the native extension. libxml2 development headers must be installed: + +``` +# Debian/Ubuntu +sudo apt-get install libxml2-dev + +# macOS (Homebrew) +brew install libxml2 +``` + +### Windows + +Install the `libxml-ruby` gem with a MinGW toolchain via [RubyInstaller](https://rubyinstaller.org/) or [MSYS2](https://msys2.github.io/). + +## Source Code + +libxml-ruby's source code is on [GitHub](https://github.com/xml4r/libxml-ruby). diff --git a/docs/parsing/dom.md b/docs/parsing/dom.md new file mode 100644 index 00000000..c2996d47 --- /dev/null +++ b/docs/parsing/dom.md @@ -0,0 +1,104 @@ +# DOM Parser + +The DOM parser (`XML::Parser`) loads an entire XML document into memory as a tree of nodes. This is the most common way to work with XML in libxml-ruby. 
+ +## Parsing + +```ruby +# From a file +doc = XML::Parser.file('books.xml').parse + +# From a string +doc = XML::Parser.string('<root/>').parse + +# From an IO +File.open('books.xml') do |io| + doc = XML::Parser.io(io).parse +end +``` + +## Example: Parse and Extract Data + +```ruby +xml = <<~XML +<library> + <book available="true"> + <title>The Pragmatic Programmer</title> + <author>Dave Thomas</author> + <year>1999</year> + </book> + <book available="false"> + <title>Design Patterns</title> + <author>Gang of Four</author> + <year>1994</year> + </book> +</library> +XML + +doc = XML::Parser.string(xml).parse + +# Access the root +root = doc.root +puts root.name # => "library" + +# Iterate over children +root.each do |book| + next unless book.element? + puts book.find_first('title').content +end + +# Use XPath +available = doc.find('//book[@available="true"]') +available.each do |book| + title = book.find_first('title').content + year = book.find_first('year').content + puts "#{title} (#{year})" +end +``` + +## Example: Parse a Configuration File + +```ruby +doc = XML::Parser.file('config.xml').parse + +db_host = doc.find_first('//database/host').content +db_port = doc.find_first('//database/port').content.to_i +db_name = doc.find_first('//database/name').content + +puts "Connecting to #{db_name} at #{db_host}:#{db_port}" +``` + +## Example: Parse with Options + +```ruby +# Strip whitespace-only text nodes and disable network access +parser = XML::Parser.file('data.xml') +parser.options = XML::Parser::Options::NOBLANKS | XML::Parser::Options::NONET +doc = parser.parse +``` + +## Example: Parse from a Web Response + +```ruby +require 'net/http' + +uri = URI('https://example.com/api/data.xml') +xml = Net::HTTP.get(uri) + +doc = XML::Parser.string(xml).parse +doc.find('//item').each do |item| + puts item.find_first('name').content +end +``` + +## Error Handling + +```ruby +begin + doc = XML::Parser.string('<broken').parse +rescue XML::Error => e + puts "Parse failed: #{e.message}" +end +``` + +See [Error Handling](../xml/errors.md) for details on error properties and custom handlers.
diff --git a/docs/parsing/html.md b/docs/parsing/html.md new file mode 100644 index 00000000..aac3a553 --- /dev/null +++ b/docs/parsing/html.md @@ -0,0 +1,97 @@ +# HTML Parser + +The `XML::HTMLParser` parses HTML documents, including malformed HTML that would fail strict XML parsing. It produces a DOM tree just like `XML::Parser`. + +## Parsing HTML + +```ruby +# From a file +doc = XML::HTMLParser.file('page.html').parse + +# From a string +doc = XML::HTMLParser.string('

Hello

').parse + +# From an IO +File.open('page.html') do |io| + doc = XML::HTMLParser.io(io).parse +end +``` + +## Example: Extract Links from HTML + +```ruby +html = <<~HTML + + + Example + Ruby + About + + +HTML + +doc = XML::HTMLParser.string(html).parse + +doc.find('//a[@href]').each do |link| + puts "#{link.content} -> #{link['href']}" +end +``` + +## Example: Extract Text Content + +```ruby +doc = XML::HTMLParser.file('article.html').parse + +# Get all paragraph text +doc.find('//p').each do |p| + puts p.content +end + +# Get the page title +title = doc.find_first('//title') +puts title.content if title +``` + +## Example: Parse a Table + +```ruby +doc = XML::HTMLParser.string(html).parse + +doc.find('//table//tr').each do |row| + cells = row.find('td').map(&:content) + puts cells.join(' | ') +end +``` + +## Handling Malformed HTML + +The HTML parser is lenient — it handles missing close tags, incorrect nesting, and other common HTML issues: + +```ruby +# This would fail as XML but parses fine as HTML +html = '

First

Second
BoldBoldItalic' + +doc = XML::HTMLParser.string(html).parse +doc.find('//p').each { |p| puts p.content } +``` + +## Options + +HTML parser options are on `XML::HTMLParser::Options`: + +```ruby +parser = XML::HTMLParser.string(html) +parser.options = XML::HTMLParser::Options::NOERROR | + XML::HTMLParser::Options::NOWARNING +doc = parser.parse +``` + +Suppressing errors and warnings is common with real-world HTML, which often triggers parser warnings. + +## Encoding + +Specify encoding when the HTML doesn't declare it: + +```ruby +doc = XML::HTMLParser.string(html, encoding: XML::Encoding::UTF_8).parse +``` diff --git a/docs/parsing/overview.md b/docs/parsing/overview.md new file mode 100644 index 00000000..f97e49cc --- /dev/null +++ b/docs/parsing/overview.md @@ -0,0 +1,81 @@ +# Parsing Overview + +libxml-ruby provides four parsers for reading XML and HTML content. Each parser supports files, strings, IO objects and URIs as data sources. + +## Parser Comparison + +| Parser | API Style | Memory | Use Case | +|--------|-----------|--------|----------| +| [DOM Parser](dom.md) | Tree | Loads entire document | Most common. Navigate and modify documents freely. | +| [Reader](reader.md) | Pull/cursor | Streaming | Large documents. Move forward through nodes one at a time. | +| [SAX Parser](sax.md) | Push/callback | Streaming | Event-driven processing. You define callbacks for each event. | +| [HTML Parser](html.md) | Tree | Loads entire document | Malformed HTML. Tolerates missing tags, bad nesting, etc. | + +## Choosing a Parser + +For most use cases, start with the **DOM Parser**. It loads the entire document into memory and gives you full access to navigate, query, and modify the tree. + +Use the **Reader** when the document is too large for memory, or when you only need to extract specific data in a single pass. + +Use the **SAX Parser** only if you need maximum control over the parsing events. The Reader is usually simpler for streaming. 
+ +Use the **HTML Parser** when dealing with real-world HTML that may not be well-formed XML. + +## Data Sources + +All parsers support the same data sources: + +```ruby +# From a file +doc = XML::Parser.file('data.xml').parse + +# From a string +doc = XML::Parser.string('<root/>').parse + +# From an IO object +File.open('data.xml') do |io| + doc = XML::Parser.io(io).parse +end +``` + +## Parser Options + +Options control parsing behavior. They are constants on `XML::Parser::Options` and can be combined with bitwise OR: + +```ruby +parser = XML::Parser.file('data.xml') +parser.options = XML::Parser::Options::NOBLANKS | XML::Parser::Options::NONET +doc = parser.parse +``` + +Common options: + +| Option | Effect | +|--------|--------| +| `NOBLANKS` | Remove blank nodes (whitespace-only text between elements) | +| `NONET` | Disable network access (recommended for untrusted input) | +| `NOERROR` | Suppress error messages | +| `NOWARNING` | Suppress warning messages | +| `NOCDATA` | Merge CDATA as text nodes | +| `DTDLOAD` | Load the external DTD subset | +| `DTDVALID` | Validate with the DTD | +| `HUGE` | Relax hardcoded parser limits | + +## Security + +When parsing untrusted input, disable network access and leave entity substitution turned off (do not set `NOENT` — despite its name, it enables entity substitution): + +```ruby +parser = XML::Parser.string(untrusted_xml) +parser.options = XML::Parser::Options::NONET +doc = parser.parse +``` + +## Encoding + +Specify the encoding when the document doesn't declare it: + +```ruby +parser = XML::Parser.file('data.xml', encoding: XML::Encoding::UTF_8) +doc = parser.parse +``` diff --git a/docs/parsing/reader.md b/docs/parsing/reader.md new file mode 100644 index 00000000..b01294de --- /dev/null +++ b/docs/parsing/reader.md @@ -0,0 +1,166 @@ +# Reader + +The `XML::Reader` provides a pull-based streaming API for reading XML. It acts as a cursor moving forward through the document, stopping at each node. This is more memory efficient than DOM parsing for large documents. 
+ +## Basic Usage + +```ruby +reader = XML::Reader.file('large.xml') + +while reader.read + if reader.node_type == XML::Reader::TYPE_ELEMENT + puts reader.name + end +end +``` + +## Node Properties + +At each position, the reader exposes the current node's properties: + +```ruby +reader.name # node name +reader.value # node value (for text, attributes) +reader.node_type # node type constant +reader.depth # nesting depth +reader.empty_element? # self-closing element? +reader.has_attributes? # has attributes? +reader.local_name # local name (without prefix) +reader.namespace_uri # namespace URI +reader.prefix # namespace prefix +``` + +## Node Type Constants + +```ruby +XML::Reader::TYPE_ELEMENT # opening tag +XML::Reader::TYPE_END_ELEMENT # closing tag +XML::Reader::TYPE_TEXT # text content +XML::Reader::TYPE_CDATA # CDATA section +XML::Reader::TYPE_COMMENT # comment +XML::Reader::TYPE_SIGNIFICANT_WHITESPACE +``` + +## Reading Attributes + +```ruby +reader = XML::Reader.string('') +reader.read + +reader['id'] # => "1" +reader.get_attribute('title') # => "Ruby" +reader.attribute_count # => 2 + +# Walk attributes +reader.move_to_first_attribute +puts "#{reader.name}=#{reader.value}" +while reader.move_to_next_attribute + puts "#{reader.name}=#{reader.value}" +end +reader.move_to_element # move back to the element +``` + +## Example: Extract Data from a Large File + +```ruby +reader = XML::Reader.file('products.xml') + +products = [] +while reader.read + if reader.node_type == XML::Reader::TYPE_ELEMENT && reader.name == 'product' + product = {} + product['id'] = reader['id'] + + # Read child elements + while reader.read + break if reader.node_type == XML::Reader::TYPE_END_ELEMENT && reader.name == 'product' + + if reader.node_type == XML::Reader::TYPE_ELEMENT + name = reader.name + reader.read # move to text content + product[name] = reader.value if reader.has_value? 
+ end + end + + products << product + end +end + +products.each { |p| puts "#{p['name']}: $#{p['price']}" } +``` + +## Example: Count Elements + +```ruby +reader = XML::Reader.file('data.xml') +counts = Hash.new(0) + +while reader.read + if reader.node_type == XML::Reader::TYPE_ELEMENT + counts[reader.name] += 1 + end +end + +counts.sort_by { |_, v| -v }.each do |name, count| + puts "#{name}: #{count}" +end +``` + +## Navigating with next + +`reader.read` descends into child nodes. Use `reader.next` to skip to the next sibling, skipping the current node's subtree: + +```ruby +reader = XML::Reader.file('data.xml') + +while reader.read + if reader.node_type == XML::Reader::TYPE_ELEMENT && reader.name == 'skip_me' + reader.next # skip this element and its children + end +end +``` + +## Expanding Nodes + +You can expand the current node into a full DOM subtree for detailed inspection: + +```ruby +reader = XML::Reader.file('books.xml') +while reader.read + if reader.name == 'book' && reader.node_type == XML::Reader::TYPE_ELEMENT + node = reader.expand + + # Use XPath on the expanded node (requires reader.doc first) + reader.doc + title = node.find_first('title').content + puts title + end +end +``` + +!!! warning + Expanded nodes are only valid until the next `reader.read` call. Do not store references to them. + +## Validation While Reading + +The reader can validate against a schema as it reads: + +```ruby +reader = XML::Reader.file('data.xml') +reader.schema_validate('schema.xsd') + +while reader.read + # reader.valid? 
returns the validation state +end +``` + +Or with RelaxNG: + +```ruby +reader = XML::Reader.file('data.xml') +reader.relax_ng_validate('schema.rng') + +while reader.read + # process nodes +end +``` diff --git a/docs/parsing/sax.md b/docs/parsing/sax.md new file mode 100644 index 00000000..78458f27 --- /dev/null +++ b/docs/parsing/sax.md @@ -0,0 +1,152 @@ +# SAX Parser + +The SAX parser (`XML::SaxParser`) provides an event-driven, callback-based API for parsing XML. You define a handler class with callback methods that are invoked as the parser encounters elements, text, and other XML constructs. + +## Defining a Handler + +Include `XML::SaxParser::Callbacks` and define the callbacks you need: + +```ruby +class MyHandler + include XML::SaxParser::Callbacks + + def on_start_element_ns(name, attributes, prefix, uri, namespaces) + puts "Start: #{name}" + end + + def on_end_element_ns(name, prefix, uri) + puts "End: #{name}" + end + + def on_characters(chars) + puts "Text: #{chars}" unless chars.strip.empty? + end +end +``` + +## Parsing + +```ruby +handler = MyHandler.new + +parser = XML::SaxParser.file('data.xml') +parser.callbacks = handler +parser.parse +``` + +## Available Callbacks + +| Callback | Called When | +|----------|------------| +| `on_start_document` | Document begins | +| `on_end_document` | Document ends | +| `on_start_element_ns` | Opening tag | +| `on_end_element_ns` | Closing tag | +| `on_characters` | Text content | +| `on_cdata_block` | CDATA section | +| `on_comment` | Comment | +| `on_processing_instruction` | Processing instruction | +| `on_error` | Parse error | + +## Example: Extract All Text from a Document + +```ruby +class TextExtractor + include XML::SaxParser::Callbacks + + attr_reader :text + + def initialize + @text = [] + end + + def on_characters(chars) + stripped = chars.strip + @text << stripped unless stripped.empty? 
+ end +end + +handler = TextExtractor.new +parser = XML::SaxParser.file('document.xml') +parser.callbacks = handler +parser.parse + +puts handler.text.join(' ') +``` + +## Example: Build a Hash from XML + +```ruby +class XMLToHash + include XML::SaxParser::Callbacks + + attr_reader :result + + def initialize + @result = {} + @stack = [] + @current_text = '' + end + + def on_start_element_ns(name, attributes, prefix, uri, namespaces) + @stack.push(name) + @current_text = '' + end + + def on_characters(chars) + @current_text << chars + end + + def on_end_element_ns(name, prefix, uri) + @stack.pop + stripped = @current_text.strip + @result[name] = stripped unless stripped.empty? + @current_text = '' + end +end + +handler = XMLToHash.new +parser = XML::SaxParser.string('<user><name>Alice</name><email>alice@example.com</email></user>') +parser.callbacks = handler +parser.parse + +puts handler.result # => {"name"=>"Alice", "email"=>"alice@example.com"} +``` + +## Example: Count Elements by Name + +```ruby +class ElementCounter + include XML::SaxParser::Callbacks + + attr_reader :counts + + def initialize + @counts = Hash.new(0) + end + + def on_start_element_ns(name, attributes, prefix, uri, namespaces) + @counts[name] += 1 + end +end + +handler = ElementCounter.new +parser = XML::SaxParser.file('large_document.xml') +parser.callbacks = handler +parser.parse + +handler.counts.sort_by { |_, v| -v }.first(10).each do |name, count| + puts "#{name}: #{count}" +end +``` + +## When to Use SAX + +SAX parsing is useful when: + +- The document is too large for DOM parsing +- You only need to extract specific data +- You want to process the document in a single pass +- Memory usage is a concern + +For most streaming use cases, the [Reader](reader.md) provides a simpler API. 
diff --git a/docs/validation/dtd.md b/docs/validation/dtd.md new file mode 100644 index 00000000..ad7c9d20 --- /dev/null +++ b/docs/validation/dtd.md @@ -0,0 +1,33 @@ +# DTD Validation + +A Document Type Definition (DTD) defines the structure and allowed elements of an XML document. + +## Creating a DTD + +```ruby +# From a string +dtd = XML::Dtd.new(<<~DTD) + + +DTD + +# From public and system identifiers +dtd = XML::Dtd.new( + '-//W3C//DTD XHTML 1.0 Transitional//EN', + 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' +) +``` + +## Validating + +```ruby +doc = XML::Parser.file('data.xml').parse + +begin + doc.validate(dtd) +rescue XML::Error => e + puts e.message +end +``` + +The `validate` method returns `true` on success or raises `XML::Error` with details about the validation failure. diff --git a/docs/validation/relaxng.md b/docs/validation/relaxng.md new file mode 100644 index 00000000..98e2671b --- /dev/null +++ b/docs/validation/relaxng.md @@ -0,0 +1,28 @@ +# RelaxNG Validation + +RelaxNG is a schema language for XML that is simpler and more expressive than XML Schema for many use cases. + +## Loading a Schema + +```ruby +# From a file +schema = XML::RelaxNG.new('/path/to/schema.rng') + +# From a document +schema_doc = XML::Document.file('schema.rng') +schema = XML::RelaxNG.document(schema_doc) +``` + +## Validating + +```ruby +doc = XML::Parser.file('data.xml').parse + +begin + doc.validate_relaxng(schema) +rescue XML::Error => e + puts e.message +end +``` + +The `validate_relaxng` method returns `true` on success or raises `XML::Error` on failure. diff --git a/docs/validation/schema.md b/docs/validation/schema.md new file mode 100644 index 00000000..3ebaaa9e --- /dev/null +++ b/docs/validation/schema.md @@ -0,0 +1,40 @@ +# XML Schema Validation + +XML Schema (XSD) is the W3C standard for defining the structure, content, and data types of XML documents. 
+ +## Loading a Schema + +```ruby +# From a file +schema = XML::Schema.new('/path/to/schema.xsd') + +# From a document +schema_doc = XML::Document.file('schema.xsd') +schema = XML::Schema.document(schema_doc) +``` + +## Validating + +```ruby +doc = XML::Parser.file('data.xml').parse + +begin + doc.validate_schema(schema) +rescue XML::Error => e + puts e.message +end +``` + +## Inspecting the Schema + +```ruby +schema.target_namespace + +schema.types.each do |name, type| + puts "Type: #{name}" +end + +schema.elements.each do |name, element| + puts "Element: #{name}" +end +``` diff --git a/docs/xml/attributes.md b/docs/xml/attributes.md new file mode 100644 index 00000000..dc8a7910 --- /dev/null +++ b/docs/xml/attributes.md @@ -0,0 +1,47 @@ +# Attributes + +Attributes are name-value pairs on element nodes. + +## Reading Attributes + +```ruby +doc = XML::Parser.string('').parse +node = doc.root + +# Hash-style access +node['id'] # => "1" +node['lang'] # => "en" + +# Attribute object +attr = node.attributes.get_attribute('id') +attr.name # => "id" +attr.value # => "1" +``` + +## Setting Attributes + +```ruby +node['category'] = 'fiction' +``` + +## Removing Attributes + +```ruby +attr = node.attributes.get_attribute('lang') +attr.remove! +``` + +## Iterating + +```ruby +node.attributes.each do |attr| + puts "#{attr.name} = #{attr.value}" +end +``` + +## Namespaced Attributes + +```ruby +attr = node.attributes.get_attribute_ns('http://www.w3.org/1999/xlink', 'href') +attr.value +``` diff --git a/docs/xml/documents.md b/docs/xml/documents.md new file mode 100644 index 00000000..cd045afa --- /dev/null +++ b/docs/xml/documents.md @@ -0,0 +1,120 @@ +# Documents + +An `XML::Document` represents an entire XML document. It holds the root node, encoding, version, and other document-level properties. 
+ +## Creating Documents + +```ruby +doc = XML::Document.new +doc.root = XML::Node.new('catalog') +doc.encoding = XML::Encoding::UTF_8 +``` + +## Document Properties + +```ruby +doc = XML::Parser.file('data.xml').parse + +doc.root # root element node +doc.encoding # document encoding +doc.version # XML version ("1.0") +doc.url # source URL/filename +doc.node_type # XML::Node::DOCUMENT_NODE +``` + +## Saving + +```ruby +# To a file +doc.save('output.xml') +doc.save('output.xml', indent: true, encoding: XML::Encoding::UTF_8) + +# To a string +xml_string = doc.to_s +xml_string = doc.to_s(indent: true) +``` + +## Example: Parse, Modify, Save + +```ruby +doc = XML::Parser.file('config.xml').parse + +# Update a value +node = doc.find_first('//setting[@name="timeout"]') +node.content = '30' + +# Add a new setting +new_setting = XML::Node.new('setting', '100') +new_setting['name'] = 'max_retries' +doc.root << new_setting + +# Save +doc.save('config.xml', indent: true, encoding: XML::Encoding::UTF_8) +``` + +## Example: Create an XML Document + +```ruby +doc = XML::Document.new +doc.root = XML::Node.new('bookstore') + +book = XML::Node.new('book') +book['category'] = 'fiction' + +title = XML::Node.new('title', 'The Great Gatsby') +title['lang'] = 'en' +book << title + +book << XML::Node.new('author', 'F. Scott Fitzgerald') +book << XML::Node.new('year', '1925') +book << XML::Node.new('price', '10.99') + +doc.root << book + +puts doc.to_s(indent: true) +``` + +Output: + +```xml + + + + The Great Gatsby + F. 
Scott Fitzgerald + 1925 + 10.99 + + +``` + +## Example: Import a Node from Another Document + +```ruby +source = XML::Parser.file('source.xml').parse +target = XML::Parser.file('target.xml').parse + +# Copy a node from source into target +node = source.find_first('//item[@id="special"]') +imported = target.import(node) +target.root << imported +``` + +## Canonicalization + +```ruby +# C14N canonical form +canonical = doc.canonicalize +``` + +## XInclude + +Process XInclude directives in a document: + +```ruby +doc.xinclude +``` + +## Validation + +Documents can be validated against DTDs, RelaxNG schemas, and XML Schemas. See the [Validation](../validation/dtd.md) section. diff --git a/docs/xml/encoding.md b/docs/xml/encoding.md new file mode 100644 index 00000000..892caf02 --- /dev/null +++ b/docs/xml/encoding.md @@ -0,0 +1,42 @@ +# Encoding + +libxml-ruby handles character encoding conversion between libxml2 (which works in UTF-8 internally) and Ruby's encoding system. + +## Document Encoding + +```ruby +# Set encoding when creating a document +doc = XML::Document.new +doc.encoding = XML::Encoding::UTF_8 + +# Read encoding from a parsed document +doc = XML::Parser.file('data.xml').parse +doc.encoding # => XML::Encoding::UTF_8 +``` + +## Input Encoding + +Specify the encoding when parsing if it's not declared in the document: + +```ruby +parser = XML::Parser.file('data.xml', encoding: XML::Encoding::ISO_8859_1) +doc = parser.parse +``` + +## String Encoding + +Strings returned by libxml-ruby carry the appropriate Ruby encoding based on the document's encoding. Internal conversion is handled automatically. + +## Available Encodings + +Encoding constants are defined on `XML::Encoding`: + +```ruby +XML::Encoding::UTF_8 +XML::Encoding::UTF_16LE +XML::Encoding::UTF_16BE +XML::Encoding::ISO_8859_1 +XML::Encoding::ASCII +``` + +Use `XML::Encoding.to_s` to convert a constant to its string name, or `XML::Encoding.from_s` to look up a constant by name. 
diff --git a/docs/xml/errors.md b/docs/xml/errors.md new file mode 100644 index 00000000..902e0f64 --- /dev/null +++ b/docs/xml/errors.md @@ -0,0 +1,55 @@ +# Error Handling + +libxml-ruby reports errors through the `XML::Error` exception class. Errors from parsing, validation, and other operations all use this class. + +## Catching Errors + +```ruby +begin + doc = XML::Parser.string('<root><unclosed></root>').parse +rescue XML::Error => e + puts e.message +end +``` + +## Error Properties + +`XML::Error` provides detailed information: + +```ruby +rescue XML::Error => e + e.message # human-readable message + e.domain # error domain (e.g., XML::Error::PARSER) + e.code # error code + e.level # severity (WARNING, ERROR, FATAL) + e.file # source file + e.line # line number + e.node # related node (if available) + e.str1 # additional context +end +``` + +## Error Levels + +```ruby +XML::Error::NONE # no error +XML::Error::WARNING # warning +XML::Error::ERROR # recoverable error +XML::Error::FATAL # fatal error +``` + +## Custom Error Handler + +Set a global error handler to capture errors that don't raise exceptions (such as validation warnings): + +```ruby +XML::Error.set_handler do |error| + $stderr.puts "XML: #{error.message}" +end +``` + +To reset to the default handler: + +```ruby +XML::Error.reset_handler +``` diff --git a/docs/xml/namespaces.md b/docs/xml/namespaces.md new file mode 100644 index 00000000..f109301b --- /dev/null +++ b/docs/xml/namespaces.md @@ -0,0 +1,47 @@ +# Namespaces + +XML namespaces prevent naming conflicts by qualifying element and attribute names with a URI. 
+ +## Reading Namespaces + +```ruby +doc = XML::Parser.string('').parse +node = doc.root + +node.namespaces.each do |ns| + puts "#{ns.prefix}: #{ns.href}" +end +``` + +## Namespace Properties + +```ruby +ns = node.namespaces.first +ns.prefix # => "dc" (nil for default namespace) +ns.href # => "http://purl.org/dc/elements/1.1/" +``` + +## Default Namespace + +```ruby +doc = XML::Parser.string('').parse +ns = doc.root.namespaces.default +ns.href # => "http://example.com" +``` + +## XPath with Namespaces + +Default namespaces require a prefix when using XPath. See [XPath](xpath.md) for details. + +```ruby +doc.find('//ns:element', 'ns:http://example.com') +``` + +## Setting a Default Prefix + +To simplify XPath queries on documents with a default namespace: + +```ruby +doc.root.namespaces.default_prefix = 'ns' +doc.find('//ns:element') +``` diff --git a/docs/xml/nodes.md b/docs/xml/nodes.md new file mode 100644 index 00000000..bc16312b --- /dev/null +++ b/docs/xml/nodes.md @@ -0,0 +1,229 @@ +# Nodes + +Nodes are the primary objects in an XML document tree. The `XML::Node` class represents elements, text, CDATA, comments, and other node types. + +## Node Types + +```ruby +XML::Node::ELEMENT_NODE # +XML::Node::ATTRIBUTE_NODE # attribute="value" +XML::Node::TEXT_NODE # text content +XML::Node::CDATA_SECTION_NODE # +XML::Node::COMMENT_NODE # +XML::Node::DOCUMENT_NODE # the document itself +XML::Node::PI_NODE # +``` + +## Creating Nodes + +```ruby +# Element +node = XML::Node.new('book') + +# Element with text content +node = XML::Node.new('title', 'Ruby Programming') + +# Special node types +cdata = XML::Node.new_cdata('raw content') +comment = XML::Node.new_comment('a comment') +pi = XML::Node.new_pi('xml-stylesheet', 'type="text/xsl" href="style.xsl"') +``` + +## Reading Node Properties + +```ruby +node.name # element name +node.content # text content (recursively) +node.node_type # node type constant +node.empty? # no children? 
+node.doc # owning document +node.line_num # line number in source +node.path # XPath path to this node +node.lang # xml:lang value +node.base_uri # base URI +``` + +## Traversal + +```ruby +node.parent # parent node +node.child # first child (alias: first) +node.last # last child +node.next # next sibling +node.prev # previous sibling + +# Iterate over direct children +node.each { |child| puts child.name } +``` + +### Example: Walk a Document + +```ruby +doc = XML::Parser.string(<<~XML).parse + + Ruby29.99 + Rails39.99 + +XML + +doc.root.each do |book| + next unless book.element? + title = book.find_first('title').content + price = book.find_first('price').content + puts "#{title}: $#{price}" +end +``` + +## Modifying Content + +### Changing Text + +```ruby +node = doc.find_first('//title') +node.content = 'New Title' +``` + +!!! note + Setting `content=` replaces all child nodes with a single text node. The value is automatically XML-escaped. + +### Renaming Elements + +```ruby +node.name = 'new_name' +``` + +### Setting Attributes + +```ruby +node['category'] = 'fiction' +node['id'] = '42' +``` + +## Adding Nodes + +### Append a Child + +```ruby +parent << XML::Node.new('child') +parent << XML::Node.new('item', 'text content') +parent << 'plain text' # adds a text node +``` + +### Insert Before/After + +```ruby +# Insert after a node +node.sibling = XML::Node.new('after_me') + +# Insert using next=/prev= +reference_node.next = XML::Node.new('after_reference') +reference_node.prev = XML::Node.new('before_reference') +``` + +### Example: Build a Document from Scratch + +```ruby +doc = XML::Document.new +doc.root = XML::Node.new('catalog') + +['Ruby', 'Rails', 'Sinatra'].each_with_index do |title, i| + book = XML::Node.new('book') + book['id'] = (i + 1).to_s + book << XML::Node.new('title', title) + book << XML::Node.new('in_stock', 'true') + doc.root << book +end + +puts doc.to_s(indent: true) +``` + +### Example: Add Elements from Data + +```ruby +data = [ + { 
name: 'Alice', role: 'admin' }, + { name: 'Bob', role: 'user' }, +] + +doc = XML::Document.new +doc.root = XML::Node.new('users') + +data.each do |user| + node = XML::Node.new('user') + node['role'] = user[:role] + node << XML::Node.new('name', user[:name]) + doc.root << node +end +``` + +## Removing Nodes + +```ruby +node.remove! +``` + +`remove!` detaches the node from its parent and document. The node becomes a detached root — you can reattach it elsewhere or let it be garbage collected. + +### Example: Remove Matching Nodes + +```ruby +doc.find('//item[@deprecated="true"]').each do |node| + node.remove! +end +``` + +## Moving Nodes + +To move a node, remove it and reattach it: + +```ruby +node = doc.find_first('//footnote') +node.remove! +doc.find_first('//appendix') << node +``` + +## Copying Nodes + +```ruby +shallow = node.copy(false) # element only, no children +deep = node.copy(true) # element and all descendants +``` + +### Example: Duplicate a Subtree + +```ruby +template = doc.find_first('//item[@id="template"]') +copy = template.copy(true) +copy['id'] = 'new_item' +template.parent << copy +``` + +## Detached Nodes + +A newly created node is not attached to any document — it is a *detached root node* owned by Ruby. When you insert it into a document tree, ownership transfers to the document. Removing it transfers ownership back to Ruby. + +```ruby +node = XML::Node.new('item') # detached, Ruby owns it +doc.root << node # attached, document owns it +node.remove! # detached again, Ruby owns it +``` + +If a detached node goes out of scope and is garbage collected, Ruby frees it (and all its children). Once attached, the document handles cleanup. + +## Comparing Nodes + +```ruby +# Same underlying libxml node? +node1.eql?(node2) +node1 == node2 +``` + +!!! note + Multiple Ruby objects can wrap the same underlying XML node. Use `eql?` or `==` to compare nodes, not `equal?`. 
+ +## Serialization + +```ruby +node.to_s # XML string +node.to_s(indent: true) # indented XML string +``` diff --git a/docs/xml/writer.md b/docs/xml/writer.md new file mode 100644 index 00000000..2a1f9318 --- /dev/null +++ b/docs/xml/writer.md @@ -0,0 +1,81 @@ +# Writing XML + +The `XML::Writer` class provides a streaming API for generating XML. It writes XML content sequentially without building a DOM tree, making it memory efficient for large output. + +## Output Targets + +```ruby +# Write to a file +writer = XML::Writer.file('output.xml') + +# Write to a string +writer = XML::Writer.string + +# Write to an IO object +writer = XML::Writer.io(io_object) + +# Write to an in-memory document +writer = XML::Writer.document +``` + +## Generating XML + +```ruby +writer = XML::Writer.string + +writer.start_document +writer.start_element('catalog') + +writer.start_element('book') +writer.write_attribute('id', '1') +writer.write_element('title', 'Ruby Programming') +writer.write_element('author', 'Dave Thomas') +writer.end_element # + +writer.end_element # +writer.end_document + +puts writer.result +``` + +Output: + +```xml + + + + Ruby Programming + Dave Thomas + + +``` + +## Indentation + +Enable indentation for readable output: + +```ruby +writer.set_indent(true) +writer.set_indent_string(' ') +``` + +## CDATA and Comments + +```ruby +writer.write_cdata('raw ') +writer.write_comment('a comment') +``` + +## Document Writer + +`XML::Writer.document` returns a writer that builds an `XML::Document`: + +```ruby +writer = XML::Writer.document +writer.start_document +writer.start_element('root') +writer.end_element +writer.end_document + +doc = writer.result # => XML::Document +``` diff --git a/docs/xpath/context.md b/docs/xpath/context.md new file mode 100644 index 00000000..44a0f3de --- /dev/null +++ b/docs/xpath/context.md @@ -0,0 +1,73 @@ +# XPath Context + +An `XML::XPath::Context` holds the state for XPath evaluation — registered namespaces, the context node, and an 
optional object cache. For one-off queries, `doc.find` and `node.find` create a context automatically. For repeated queries, creating a context explicitly avoids redundant setup. + +## Creating a Context + +```ruby +context = XML::XPath::Context.new(doc) +``` + +## Registering Namespaces + +```ruby +context.register_namespace('atom', 'http://www.w3.org/2005/Atom') +context.register_namespace('dc', 'http://purl.org/dc/elements/1.1/') + +entries = context.find('//atom:entry') +``` + +You can also register from a hash, array, or directly from a node's namespace declarations: + +```ruby +# From a hash +context.register_namespaces('atom' => 'http://www.w3.org/2005/Atom') + +# From an array of "prefix:uri" strings +context.register_namespaces(['atom:http://www.w3.org/2005/Atom']) + +# From a node's namespace declarations +context.register_namespaces_from_node(doc.root) +``` + +## Setting the Context Node + +By default the context node is the document root. Change it to evaluate XPath relative to a different node: + +```ruby +context.node = doc.root.find_first('//chapter') +sections = context.find('section') # relative to chapter +``` + +## Caching + +Enable the XPath object cache to reuse internal XPath objects across queries, reducing allocations: + +```ruby +context.enable_cache # default cache size +context.enable_cache(100) # specific size + +# When done +context.disable_cache +``` + +## Return Types + +`context.find` returns different types depending on the XPath expression: + +| XPath expression | Ruby return type | +|-----------------|-----------------| +| `//book` | `XML::XPath::Object` (node set) | +| `count(//book)` | `Float` | +| `string(//title)` | `String` | +| `1 = 1` | `true` or `false` | + +An `XML::XPath::Object` for a node set is enumerable: + +```ruby +results = context.find('//book') +results.each { |node| puts node.name } +results.length +results.first +results[0] +``` diff --git a/docs/xpath/namespaces.md b/docs/xpath/namespaces.md new file mode 
100644 index 00000000..aac61304 --- /dev/null +++ b/docs/xpath/namespaces.md @@ -0,0 +1,94 @@ +# XPath and Namespaces + +Namespaces are the most common source of confusion with XPath. An element in a namespace will not be found by a bare name — you must register a prefix and use it in the query. + +## The Problem + +Given this document: + +```xml + + My Feed + + First Post + + +``` + +This XPath will find nothing: + +```ruby +doc.find('//title') # => empty, because "title" is in the Atom namespace +``` + +## The Solution: Register a Prefix + +```ruby +# Pass a namespace string — "prefix:uri" +doc.find('//atom:title', 'atom:http://www.w3.org/2005/Atom') +``` + +The prefix you choose (`atom` here) does not have to match the document — it's just a local binding for the XPath query. The URI must match exactly. + +## Multiple Namespaces + +### Array Form + +```ruby +doc.find('//atom:entry/dc:creator', [ + 'atom:http://www.w3.org/2005/Atom', + 'dc:http://purl.org/dc/elements/1.1/' +]) +``` + +### Hash Form + +```ruby +doc.find('//atom:entry/dc:creator', { + 'atom' => 'http://www.w3.org/2005/Atom', + 'dc' => 'http://purl.org/dc/elements/1.1/' +}) +``` + +## Default Namespace Shortcut + +If you query the same namespace-heavy document repeatedly, set a default prefix: + +```ruby +doc.root.namespaces.default_prefix = 'atom' + +# Now queries automatically use the 'atom' prefix +doc.find('//atom:title') +doc.find('//atom:entry') +``` + +## Complex Namespace Example + +```xml + + + + + + + + + +``` + +```ruby +# soap: is defined on the root, so it's automatically available +doc.find('/soap:Envelope') + +# ns1: is defined deeper, register it explicitly +doc.find('//ns1:Item', 'ns1:http://domain.example.com') + +# The default namespace on getResponse needs a prefix +doc.find('//svc:getResponse', 'svc:http://services.example.com') + +# Combine multiple registrations +doc.find('/soap:Envelope/soap:Body/svc:getResponse/svc:IDList/ns1:Item', [ + 'svc:http://services.example.com', + 
'ns1:http://domain.example.com' +]) +``` diff --git a/docs/xpath/overview.md b/docs/xpath/overview.md new file mode 100644 index 00000000..98548770 --- /dev/null +++ b/docs/xpath/overview.md @@ -0,0 +1,156 @@ +# XPath + +XPath is the primary way to find and extract data from XML documents in libxml-ruby. Unlike some other Ruby XML libraries, libxml-ruby does not support CSS selectors — XPath is the query language for all search operations. + +## Quick Reference + +```ruby +doc = XML::Parser.file('books.xml').parse + +# Find all matching nodes — returns XML::XPath::Object +nodes = doc.find('//book') + +# Find from a specific node +titles = doc.root.find('book/title') + +# Find the first match +node = doc.find_first('//book[@id="1"]') + +# XPath can return different types +doc.find('count(//book)') # => Float +doc.find('string(//title)') # => String +doc.find('1 = 1') # => true +``` + +## XPath Crash Course + +If you're new to XPath, here are the essentials. + +### Selecting Nodes + +| Expression | Selects | +|-----------|---------| +| `/root` | Root element named "root" | +| `/root/child` | Direct children named "child" | +| `//book` | All "book" elements anywhere in the document | +| `.` | Current node | +| `..` | Parent node | +| `@id` | Attribute named "id" | + +### Predicates (Filters) + +| Expression | Selects | +|-----------|---------| +| `//book[1]` | First book element | +| `//book[last()]` | Last book element | +| `//book[@id]` | Books with an "id" attribute | +| `//book[@id="42"]` | Books where id is "42" | +| `//book[price>10]` | Books where price child > 10 | + +### Axes + +| Expression | Selects | +|-----------|---------| +| `child::book` | Child elements named "book" (same as `book`) | +| `ancestor::catalog` | Ancestor elements named "catalog" | +| `following-sibling::*` | All following siblings | +| `preceding-sibling::*` | All preceding siblings | +| `descendant::*` | All descendants | +| `self::book` | Current node if it's named "book" | + +### 
Functions + +| Function | Returns | +|----------|---------| +| `count(//book)` | Number of matching nodes | +| `string(//title)` | Text content of first match | +| `contains(@class, 'active')` | True if attribute contains substring | +| `starts-with(name, 'J')` | True if string starts with prefix | +| `not(@disabled)` | Boolean negation | +| `position()` | Position of current node in set | +| `normalize-space(text())` | Trimmed, collapsed whitespace | + +### Combining Expressions + +```ruby +# Union — combine multiple paths +doc.find('//title | //author') + +# Boolean operators in predicates +doc.find('//book[@year > 2000 and @lang = "en"]') +``` + +For the full XPath 1.0 specification, see the [W3C XPath Reference](https://www.w3.org/TR/xpath-10/). + +## Practical Examples + +### Extract Data from an RSS Feed + +```ruby +doc = XML::Parser.file('feed.xml').parse + +doc.find('//item').each do |item| + title = item.find_first('title').content + link = item.find_first('link').content + puts "#{title}: #{link}" +end +``` + +### Find Elements by Attribute Value + +```ruby +# All books published after 2020 +doc.find('//book[@year > 2020]').each do |book| + puts book.find_first('title').content +end + +# Elements with a specific class +doc.find('//*[@class="highlight"]') +``` + +### Conditional Extraction + +```ruby +# Books with a price, sorted extraction +doc.find('//book[price]').each do |book| + title = book.find_first('title').content + price = book.find_first('price').content.to_f + puts "#{title}: $#{'%.2f' % price}" if price > 20 +end +``` + +### Count and Aggregate + +```ruby +# Count elements +total = doc.find('count(//book)') # => Float + +# Get text content directly +first_title = doc.find('string(//book[1]/title)') # => String +``` + +### Find First Match + +```ruby +# find_first is a convenience for find(...).first +node = doc.find_first('//book[@id="42"]') +if node + puts node.find_first('title').content +end +``` + +### Navigate Relative to a Node + 
+```ruby +chapter = doc.find_first('//chapter[@id="3"]') + +# All sections within this chapter +chapter.find('section').each do |section| + puts section['title'] +end + +# Paragraphs anywhere under this chapter +chapter.find('.//p').each do |p| + puts p.content +end +``` diff --git a/zensical.toml b/zensical.toml new file mode 100644 index 00000000..ed64ebe4 --- /dev/null +++ b/zensical.toml @@ -0,0 +1,97 @@ +[project] + +site_name = "libxml-ruby" +site_description = "Ruby language bindings for the GNOME Libxml2 XML toolkit" +site_url = "https://xml4r.github.io/libxml-ruby/" + +repo_url = "https://github.com/xml4r/libxml-ruby" +repo_name = "xml4r/libxml-ruby" + +nav = [ + { "Home" = [ + "index.md", + { "Installation" = "installation.md" }, + { "Getting Started" = "getting_started.md" }, + { "Changelog" = "changelog.md" }, + ]}, + { "Parsing" = [ + { "Overview" = "parsing/overview.md" }, + { "DOM Parser" = "parsing/dom.md" }, + { "Reader" = "parsing/reader.md" }, + { "SAX Parser" = "parsing/sax.md" }, + { "HTML Parser" = "parsing/html.md" }, + ]}, + { "Working with XML" = [ + { "Documents" = "xml/documents.md" }, + { "Nodes" = "xml/nodes.md" }, + { "Attributes" = "xml/attributes.md" }, + { "Namespaces" = "xml/namespaces.md" }, + { "Writer" = "xml/writer.md" }, + { "Encoding" = "xml/encoding.md" }, + { "Error Handling" = "xml/errors.md" }, + ]}, + { "XPath" = [ + { "Overview" = "xpath/overview.md" }, + { "Namespaces" = "xpath/namespaces.md" }, + { "Context" = "xpath/context.md" }, + ]}, + { "Validation" = [ + { "DTD" = "validation/dtd.md" }, + { "RelaxNG" = "validation/relaxng.md" }, + { "XML Schema" = "validation/schema.md" }, + ]}, + { "Architecture" = [ + { "Memory Management" = "architecture/memory.md" }, + { "Pointer Registry" = "architecture/registry.md" }, + ]}, + { "API Reference" = "reference/" }, +] + +[project.theme] + +variant = "classic" + +features = [ + "navigation.tabs", + "navigation.tabs.sticky", + "navigation.top", + "search.highlight", + 
"content.code.copy", + "toc.integrate", +] + +# Palette toggle for light mode +[[project.theme.palette]] +scheme = "default" +primary = "deep-purple" +accent = "deep-purple" +toggle.icon = "lucide/sun" +toggle.name = "Switch to dark mode" + +# Palette toggle for dark mode +[[project.theme.palette]] +scheme = "slate" +primary = "deep-purple" +accent = "deep-purple" +toggle.icon = "lucide/moon" +toggle.name = "Switch to light mode" + +# Extensions +[project.markdown_extensions.admonition] + +[project.markdown_extensions.pymdownx.details] + +[project.markdown_extensions.pymdownx.superfences] +custom_fences = [ + { name = "mermaid", class = "mermaid", format = "pymdownx.superfences.fence_code_format" }, +] + +[project.markdown_extensions.pymdownx.highlight] +anchor_linenums = true + +[project.markdown_extensions.pymdownx.inlinehilite] + +[project.markdown_extensions.tables] + +[project.markdown_extensions.toc] +permalink = true