User:Aarchiba/SVG sanitizer

From Wikipedia, the free encyclopedia

Apparently Wikipedia does not host SVG files for fear that they will contain trojans. It is certainly true that SVG can contain JavaScript. If SVG viewers run this JavaScript in a trusted environment, then it might indeed be a security hole. If that's a problem, then the simplest solution is to just rip it right out. Here's a script to remove all <script> tags and their contents. Other tags are not executed, according to the SVG standard (as far as I can tell), and so can remain.

This program reads its standard input, parses it as XML, removes any script tags and anything beneath them in the DOM tree, as well as any event attributes, and then writes an equivalent XML file to its standard output. This code does not validate against the DTD, but ill-formed XML simply causes the program to throw an exception and exit, producing no output. The XML is always written as UTF-8, regardless of the character encoding specified by the input. It returns a nonzero exit status if any scripts were detected (or anything else had to be removed).

It handles tags from other namespaces by verifying that they are from one of a short list of namespaces; currently the only namespace from which tags are reliably removed or modified is the original SVG namespace.

This script successfully processes essentially all the non-broken files in the openclipart 0.11 release.

import sys
import xml.dom
import xml.dom.minidom
import re

# Sanitize SVG by removing any script calls of any sort.
# Returns a non-zero exit value if any changes were made.
#
# WARNING:
# * Does not validate the SVG against a DTD (or schema or whatever)
# * Pieces of non-SVG XML are mostly not sanitized, but must come from a short list of namespaces.
# * Reformats even documents that need no changes (but leaves the XML semantically identical).
#


class Namespace:
    """An XML namespace, identified by its URI.

    Only the URI is stored at construction time. The sanitizing rules for a
    namespace (lists of forbidden tags and attributes) are attached to the
    instance afterwards as ordinary attributes.
    """

    def __init__(self, name):
        # `name` is the namespace URI string.
        self.name = name


# Sanitizing rules for SVG itself.
svg = Namespace("http://www.w3.org/2000/svg")

# Event-handler attributes. This is the complete list given at
# http://www.w3.org/TR/SVG/interact.html#SVGEvents
svg.event_attributes = [
    "onfocusin", "onfocusout", "onactivate",
    "onclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    "onload", "onunload", "onabort",
    "onerror", "onresize", "onscroll",
    "onzoom", "onbegin", "onend",
    "onrepeat",
]

# Script-related names, from http://www.w3.org/TR/SVG/script.html
svg.script_attributes = ["contentScriptType"]
svg.script_tags = ["script"]

# What actually gets stripped. The attribute list is the script attributes
# followed by the event attributes; the tag list is the script-tag list itself
# (the same list object, not a copy).
svg.evil_attributes = svg.script_attributes + svg.event_attributes
svg.evil_tags = svg.script_tags



# Namespace URIs, grouped into separate lists.
svgns = ["http://www.w3.org/2000/svg"]

adobens = [
    "http://ns.adobe.com/Extensibility/1.0/",
    "http://ns.adobe.com/Flows/1.0/",
    "http://ns.adobe.com/AdobeIllustrator/10.0/",
    "http://ns.adobe.com/AdobeSVGViewerExtensions/3.0/",
]

metans = [
    "http://web.resource.org/cc/",
    "http://purl.org/dc/elements/1.1/",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "http://www.w3.org/2000/xmlns/",
    "http://www.w3.org/XML/1998/namespace",
    "http://www.w3.org/1999/xlink",
]

inkns = [
    "http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd",
    "http://inkscape.sourceforge.net/DTD/sodipodi-0.dtd",
    "http://www.inkscape.org/namespaces/inkscape",
]

msns = ["http://schemas.microsoft.com/visio/2003/SVGExtensions/"]


# Every namespace a node is allowed to belong to. None stands for nodes that
# carry no namespace at all.
acceptable_namespaces = [None] + svgns + adobens + metans + inkns + msns

# The same namespaces as dictionary keys; every value is None and only the
# keys are ever consulted.
namespaces = dict.fromkeys(acceptable_namespaces)

# Some namespaces get sanitized as if they were SVG
special_namespaces = dict.fromkeys([None] + svgns + adobens + inkns + msns, svg)




def message(s):
    """Write the string `s` to standard error as one line and flush at once."""
    err = sys.stderr
    err.write(s + "\n")
    err.flush()




def element_is_acceptable(node):
    """Decide whether `node` may stay in the document.

    Returns False, after reporting the reason through message(), when the
    node's namespace is not a key of `namespaces`, or when its namespace is
    listed in `special_namespaces` and its local name is one of that
    namespace's evil tags. Returns True otherwise.

    As a side effect, sets the module-level flags `adobe_extensions` and
    `ink_extensions` when the node's namespace belongs to `adobens` or
    `inkns` respectively.
    """
    global adobe_extensions, ink_extensions

    uri = node.namespaceURI

    # Note which vendor extensions appear, whatever the verdict turns out to be.
    if uri in adobens:
        adobe_extensions = True
    if uri in inkns:
        ink_extensions = True

    if uri not in namespaces:
        message("Namespace '%s'not found; element '%s' unacceptable." % (uri, node))
        return False

    if uri in special_namespaces and node.localName in special_namespaces[uri].evil_tags:
        message("Element '%s' unacceptable." % node)
        return False

    return True

def attribute_is_acceptable(node, attribute):
    """Decide whether `attribute` may stay on the element `node`.

    An attribute with no namespace of its own is judged by the namespace of
    the element carrying it.

    Returns False, after reporting the reason through message(), when that
    namespace is not a key of `namespaces`, or when it is listed in
    `special_namespaces` and the attribute's local name is one of that
    namespace's evil attributes. Returns True otherwise.

    As a side effect, sets the module-level flags `adobe_extensions` and
    `ink_extensions` when the namespace belongs to `adobens` or `inkns`
    respectively.
    """
    # Without these declarations the assignments below would only create
    # function-local variables and the module-level flags would never change.
    global adobe_extensions
    global ink_extensions

    nsURI = attribute.namespaceURI or node.namespaceURI
    if nsURI in adobens:
        adobe_extensions = True
    if nsURI in inkns:
        ink_extensions = True
    if nsURI not in namespaces:
        # Report the offending attribute itself, not the element carrying it.
        message("Namespace '%s'not found; attribute '%s' unacceptable." % (nsURI, attribute))
        return False
    if nsURI in special_namespaces and attribute.localName in special_namespaces[nsURI].evil_attributes:
        message("Attribute '%s' unacceptable." % attribute)
        return False
    return True




# Begin cleansing
changes = False  # set to True as soon as anything is removed from the document

# The whole of standard input is parsed as XML. No exception is caught here,
# so input the parser rejects ends the script before anything is written.
doc = xml.dom.minidom.parse(sys.stdin)

# Accept all versions of SVG
if doc.doctype:
    # publicId is None when the DOCTYPE has no PUBLIC identifier. Treat that
    # as "not SVG" explicitly: handing None to re.match raises TypeError
    # instead of the ValueError intended for non-SVG input.
    public_id = doc.doctype.publicId
    if (doc.doctype.name != "svg"
            or public_id is None
            or not re.match(r"-//W3C//DTD SVG [0-9.]+//.*", public_id)):
        raise ValueError('Document does not appear to be SVG; doctype is "%s"' % public_id)
else:
    # No doctype definition; accept as SVG anyway, provided the root element
    # is <svg> in either no namespace or the SVG namespace.
    root = doc.documentElement
    if root.namespaceURI not in [None, svg.name] or root.localName != "svg":
        raise ValueError('Document does not appear to be SVG; no doctype and root tag is "%s" in namespace "%s".' % (root, root.namespaceURI))

# Generic DOM function
def walk_tree(node):
    """Yield `node` and then all of its descendants, parents before children.

    Each node's child list is copied before descending into it. The caller may
    therefore remove the node it has just been handed from its parent without
    the traversal skipping that node's next sibling. The children of a node
    removed this way are still visited.
    """
    yield node
    # childNodes is a live list: removing the current child from it while
    # looping over it directly would make the loop skip the following child.
    for child in list(node.childNodes):
        for descendant in walk_tree(child):
            yield descendant


# Set to True by the acceptability checks when a vendor namespace is seen.
adobe_extensions = False
ink_extensions = False


# Collect every node before changing anything. Removing a node while the
# traversal is still looping over its parent's live child list makes the walk
# skip the following sibling, which would let the second of two adjacent
# <script> elements through untouched.
for node in list(walk_tree(doc)):
    # Eradicate anything from other namespaces
    if not element_is_acceptable(node):
        changes = True
        node.parentNode.removeChild(node)
        # The node has left the document, so its attributes no longer matter.
        continue

    # Eradicate evil attributes
    if node.attributes is not None:
        # Gather the attribute nodes first; removing one while indexing into
        # the attribute map would shift the ones that follow.
        attrs = [node.attributes.item(i) for i in range(node.attributes.length)]
        for attr in attrs:
            if not attribute_is_acceptable(node, attr):
                node.removeAttributeNode(attr)
                changes = True

#if adobe_extensions: message("File contains Adobe extensions to SVG.")
#if ink_extensions: message("File contains Inkscape/Sodipodi extensions to SVG.")
sys.stdout.write(doc.toxml("utf-8"))

# Newline at end of file, written explicitly rather than with a bare `print`.
sys.stdout.write("\n")

# Exit status 1 tells the caller that something was removed.
if changes:
    sys.exit(1)
else:
    sys.exit(0)

All this software requires is a working installation of Python 2.3.