diff --git a/docs/asciidoc/modules/ROOT/nav.adoc b/docs/asciidoc/modules/ROOT/nav.adoc index ef3ac084a5..5fe025da14 100644 --- a/docs/asciidoc/modules/ROOT/nav.adoc +++ b/docs/asciidoc/modules/ROOT/nav.adoc @@ -26,6 +26,8 @@ include::partial$generated-documentation/nav.adoc[] ** xref::import/load-csv.adoc[] ** xref::import/xls.adoc[] ** xref::import/html.adoc[] + ** xref::import/parquet.adoc[] + ** xref::import/gexf.adoc[] * xref:export/index.adoc[] ** xref::export/xls.adoc[] diff --git a/docs/asciidoc/modules/ROOT/pages/import/gexf.adoc b/docs/asciidoc/modules/ROOT/pages/import/gexf.adoc new file mode 100644 index 0000000000..ec06ff1c6a --- /dev/null +++ b/docs/asciidoc/modules/ROOT/pages/import/gexf.adoc @@ -0,0 +1,222 @@ +[[gexf]] += Load GEXF (Graph Exchange XML Format) +:description: This section describes procedures that can be used to import data from GEXF files. + + + +Many existing applications and data integrations use GEXF to describe a graph with nodes and edges. +For further information, you should visit the https://gexf.net/[official documentation]. + +It is possible to load or import nodes and relationships from a GEXF file with the procedures + `apoc.load.gexf` and `apoc.import.gexf`. You need to: + +* provide a path to a GEXF file +* provide configuration (optional) + +The `apoc.import.gexf` procedure reads the file like `apoc.load.gexf`, but also creates nodes and relationships in Neo4j. 
+ +For reading from files you'll have to enable the config option: + +---- +apoc.import.file.enabled=true +---- + +By default, file paths are global; for paths relative to the `import` directory, set: + +---- +apoc.import.file.use_neo4j_config=true +---- + +== Examples for apoc.load.gexf + +.load.gexf +---- + + + + + + + + + + + + +---- + +[source, cypher] +---- +CALL apoc.load.gexf('load.gexf') +---- + +.Results +[opts="header"] +|=== +| value +| {_type: gexf, _children: [{_type: graph, defaultedgetype: directed, _children: [{_type: nodes, _children: [{_type: node, _children: [{_type: attvalues, _children: [{_type: attvalue, for: 0, value: http://gephi.org}]}], foo: bar}]}]}], version: 1.2} +|=== + +== Examples for apoc.import.gexf + +Besides the file you can pass in a config map: + +.Config parameters +[opts=header] +|=== +| name | type | default | description +| readLabels | Boolean | false | Creates node labels based on the value in the `label` attribute of `node` elements +| defaultRelationshipType | String | RELATED | The default relationship type to use if none is specified in the GEXF file +| storeNodeIds | Boolean | false | Stores the `id` attribute of `node` elements in an `id` property +| batchSize | Integer | 20000 | The number of elements to process per transaction +| compression | `Enum[NONE, BYTES, GZIP, BZIP2, DEFLATE, BLOCK_LZ4, FRAMED_SNAPPY]` | `null` | Allow taking binary data, either not compressed (value: `NONE`) or compressed (other values) +| source | Map | Empty map | See `source / target config` parameter below +| target | Map | Empty map | See `source / target config` parameter below +See the xref::overview/apoc.load/apoc.load.csv.adoc#_binary_file[Binary file example] +|=== + + +Importing the following file creates: + +* 1 node with label Gephi +* 2 nodes with label Webatlas +* 1 node with label RTGI +* 1 node with label BarabasiLab +* 6 relationships of kind KNOWS +* 1 relationship of kind HAS_TICKET +* 1 relationship of kind BAZ + +.data.gexf +---- + + 
+ + Gephi.org + A Web network + + + + + + + + + + + + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +---- + +[source, cypher] +---- +CALL apoc.import.gexf('data.gexf', {readLabels:true}) +---- + +.Results +[opts="header"] +|=== +| value +| { +"relationships" : 8, +"batches" : 0, +"file" : "file:/../data.gexf", +"nodes" : 5, +"format" : "gexf", +"source" : "file", +"time" : 9736, +"rows" : 0, +"batchSize" : -1, +"done" : true, +"properties" : 21 +} +|=== + +We can also store the node IDs by executing: +[source, cypher] +---- +CALL apoc.import.gexf('data.gexf', {readLabels:true, storeNodeIds: true}) +---- + +=== source / target config + +Allows importing relationships when the source and/or target nodes are not present in the file, by searching for existing nodes via a custom label and property. +To do this, we can insert into the config map `source: {label: 'MyLabel', id: 'myIdProperty'}` and/or `target: {label: 'MyLabel', id: 'myIdProperty'}`. +In this way, the start and end nodes are looked up via the `source` and `target` attributes of the `edge` tag. + +For example, with a config map `{source: {id: 'myId', label: 'Foo'}, target: {id: 'other', label: 'Bar'}}` +and an edge row like `<edge source="n0" target="n1" kind="KNOWS"/>`, +we search a source node `(:Foo {myId: 'n0'})` and an end node `(:Bar {other: 'n1'})`. +The id key is optional (the default is `'id'`). 
+ + + + diff --git a/docs/asciidoc/modules/ROOT/pages/import/index.adoc b/docs/asciidoc/modules/ROOT/pages/import/index.adoc index 8b645759e4..732f3dbd39 100644 --- a/docs/asciidoc/modules/ROOT/pages/import/index.adoc +++ b/docs/asciidoc/modules/ROOT/pages/import/index.adoc @@ -13,3 +13,4 @@ For more information on these procedures, see: * xref::import/xls.adoc[] * xref::import/html.adoc[] * xref::import/parquet.adoc[] +* xref::import/gexf.adoc[] diff --git a/extended/src/main/java/apoc/load/Gexf.java b/extended/src/main/java/apoc/load/Gexf.java new file mode 100644 index 0000000000..b7e75571e2 --- /dev/null +++ b/extended/src/main/java/apoc/load/Gexf.java @@ -0,0 +1,83 @@ +package apoc.load; + +import apoc.Extended; +import apoc.Pools; +import apoc.export.util.CountingReader; +import apoc.export.util.ExportConfig; +import apoc.export.util.ProgressReporter; +import apoc.load.util.XmlReadUtil.Import; +import apoc.result.MapResult; +import apoc.result.ProgressInfo; +import apoc.util.FileUtils; +import apoc.util.Util; +import org.neo4j.graphdb.GraphDatabaseService; +import org.neo4j.graphdb.security.URLAccessChecker; +import org.neo4j.procedure.Context; +import org.neo4j.procedure.Description; +import org.neo4j.procedure.Mode; +import org.neo4j.procedure.Name; +import org.neo4j.procedure.Procedure; +import org.neo4j.procedure.TerminationGuard; + +import java.util.Map; +import java.util.stream.Stream; + +import static apoc.load.util.XmlReadUtil.Load.xmlXpathToMapResult; + +@Extended +public class Gexf { // Procedures to load (read-only) or import (write) GEXF files + + @Context + public GraphDatabaseService db; + + @Context + public URLAccessChecker urlAccessChecker; + + @Context + public TerminationGuard terminationGuard; + + @Context + public Pools pools; + + @Procedure("apoc.load.gexf") + @Description("apoc.load.gexf(urlOrBinary, $config) - load Gexf file from URL or binary source") + public Stream gexf( + @Name("urlOrBinary") Object urlOrBinary, + @Name(value = "config", defaultValue = "{}") Map config + ) throws 
Exception { + return xmlXpathToMapResult(urlOrBinary, urlAccessChecker, terminationGuard, config); // the XPath, if any, is read from config ("path"), not from a dedicated argument + } + + @Procedure(name = "apoc.import.gexf", mode = Mode.WRITE) + @Description("Imports a graph from the provided GEXF file.") + public Stream importGexf( + @Name("urlOrBinaryFile") Object urlOrBinaryFile, @Name("config") Map config) { + ProgressInfo result = Util.inThread(pools, () -> { + ExportConfig exportConfig = new ExportConfig(config); + String file = null; + String source = "binary"; // reported as "binary" unless the input is a String URL/path + if (urlOrBinaryFile instanceof String) { + file = (String) urlOrBinaryFile; + source = "file"; + } + ProgressReporter reporter = new ProgressReporter(null, null, new ProgressInfo(file, source, "gexf")); + Import graphReader = new Import(db) + .reporter(reporter) + .batchSize(exportConfig.getBatchSize()) + .relType(exportConfig.defaultRelationshipType()) + .source(exportConfig.getSource()) + .target(exportConfig.getTarget()) + .nodeLabels(exportConfig.readLabels()); + + if (exportConfig.storeNodeIds()) graphReader.storeNodeIds(); // keeps the GEXF node id as an "id" property on created nodes + + try (CountingReader reader = + FileUtils.readerFor(urlOrBinaryFile, exportConfig.getCompressionAlgo(), urlAccessChecker)) { + graphReader.parseXML(reader, terminationGuard); + } + + return reporter.getTotal(); + }); + return Stream.of(result); + } +} diff --git a/extended/src/main/java/apoc/load/util/XmlReadUtil.java b/extended/src/main/java/apoc/load/util/XmlReadUtil.java new file mode 100644 index 0000000000..9179d7e9e9 --- /dev/null +++ b/extended/src/main/java/apoc/load/util/XmlReadUtil.java @@ -0,0 +1,726 @@ +package apoc.load.util; + +import apoc.export.util.BatchTransaction; +import apoc.export.util.CountingInputStream; +import apoc.export.util.ExportConfig; +import apoc.export.util.Reporter; +import apoc.result.MapResult; +import apoc.util.CompressionAlgo; +import apoc.util.FileUtils; +import apoc.util.JsonUtil; +import apoc.util.Util; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.text.WordUtils; +import 
org.neo4j.graphdb.Entity; +import org.neo4j.graphdb.GraphDatabaseService; +import org.neo4j.graphdb.Label; +import org.neo4j.graphdb.Node; +import org.neo4j.graphdb.Relationship; +import org.neo4j.graphdb.RelationshipType; +import org.neo4j.graphdb.Transaction; +import org.neo4j.graphdb.security.URLAccessChecker; +import org.neo4j.procedure.TerminationGuard; +import org.w3c.dom.CharacterData; +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; +import org.xml.sax.SAXParseException; + +import javax.xml.namespace.QName; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.stream.XMLEventReader; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamConstants; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.events.Attribute; +import javax.xml.stream.events.StartElement; +import javax.xml.stream.events.XMLEvent; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathFactory; +import java.io.FileNotFoundException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; +import java.lang.reflect.Array; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Deque; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; +import java.util.stream.Stream; + +import static apoc.load.util.XmlReadUtil.Load.generateXmlDoctypeException; +import static apoc.util.CompressionConfig.COMPRESSION; +import static apoc.util.ExtendedUtil.toValidValue; + +/** + * Taken from Xml + * placed in APOC Core + */ +public class XmlReadUtil { + + public static class Load { + public static 
Stream xmlXpathToMapResult( + Object urlOrBinary, URLAccessChecker urlAccessChecker, TerminationGuard terminationGuard, Map config) throws Exception { + if (config == null) config = Collections.emptyMap(); + boolean failOnError = (boolean) config.getOrDefault("failOnError", true); + String path = (String) config.getOrDefault("path", "/"); + boolean simpleMode = Util.toBoolean(config.getOrDefault("simpleMode", false)); + try { + Map headers = (Map) config.getOrDefault("headers", Collections.emptyMap()); + CountingInputStream is = FileUtils.inputStreamFor( + urlOrBinary, + headers, + null, + (String) config.getOrDefault(COMPRESSION, CompressionAlgo.NONE.name()), + urlAccessChecker); + return parse(is, simpleMode, path, failOnError, terminationGuard); + } catch (Exception e) { + if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap())); + else throw e; + } + } + + private static Stream parse(InputStream data, boolean simpleMode, String path, boolean failOnError, TerminationGuard terminationGuard) + throws Exception { + List result = new ArrayList<>(); + try { + DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance(); + documentBuilderFactory.setNamespaceAware(true); + documentBuilderFactory.setIgnoringElementContentWhitespace(true); + documentBuilderFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder(); + documentBuilder.setEntityResolver((publicId, systemId) -> new InputSource(new StringReader(""))); + + Document doc = documentBuilder.parse(data); + XPathFactory xPathFactory = XPathFactory.newInstance(); + + XPath xPath = xPathFactory.newXPath(); + + path = StringUtils.isEmpty(path) ? 
"/" : path; + XPathExpression xPathExpression = xPath.compile(path); + NodeList nodeList = (NodeList) xPathExpression.evaluate(doc, XPathConstants.NODESET); + + for (int i = 0; i < nodeList.getLength(); i++) { + final Deque> stack = new LinkedList<>(); + handleNode(stack, nodeList.item(i), simpleMode, terminationGuard); + for (int index = 0; index < stack.size(); index++) { + result.add(new MapResult(stack.pollFirst())); + } + } + } catch (FileNotFoundException e) { + if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap())); + else throw e; + } catch (Exception e) { + if (!failOnError) return Stream.of(new MapResult(Collections.emptyMap())); + else if (e instanceof SAXParseException && e.getMessage().contains("DOCTYPE is disallowed")) + throw generateXmlDoctypeException(); + else throw e; + } + return result.stream(); + } + + /** + * Collects type and attributes for the node + * + * @param node + * @param elementMap + */ + private static void handleTypeAndAttributes(org.w3c.dom.Node node, Map elementMap) { + // Set type + if (node.getLocalName() != null) { + elementMap.put("_type", node.getLocalName()); + } + + // Set the attributes + if (node.getAttributes() != null) { + NamedNodeMap attributeMap = node.getAttributes(); + for (int i = 0; i < attributeMap.getLength(); i++) { + org.w3c.dom.Node attribute = attributeMap.item(i); + elementMap.put(attribute.getNodeName(), attribute.getNodeValue()); + } + } + } + + private static void handleNode(Deque> stack, org.w3c.dom.Node node, boolean simpleMode, TerminationGuard terminationGuard) { + terminationGuard.check(); + + // Handle document node + if (node.getNodeType() == org.w3c.dom.Node.DOCUMENT_NODE) { + NodeList children = node.getChildNodes(); + for (int i = 0; i < children.getLength(); i++) { + if (children.item(i).getLocalName() != null) { + handleNode(stack, children.item(i), simpleMode, terminationGuard); + return; + } + } + } + + Map elementMap = new LinkedHashMap<>(); + 
handleTypeAndAttributes(node, elementMap); + + // Set children + NodeList children = node.getChildNodes(); + int count = 0; + for (int i = 0; i < children.getLength(); i++) { + org.w3c.dom.Node child = children.item(i); + + // This is to deal with text between xml tags for example new line characters + if (child.getNodeType() != org.w3c.dom.Node.TEXT_NODE && child.getNodeType() != org.w3c.dom.Node.CDATA_SECTION_NODE) { + handleNode(stack, child, simpleMode, terminationGuard); + count++; + } else { + // Deal with text nodes + handleTextNode(child, elementMap); + } + } + + if (children.getLength() > 0) { + if (!stack.isEmpty()) { + List nodeChildren = new ArrayList<>(); + for (int i = 0; i < count; i++) { + nodeChildren.add(stack.pollLast()); + } + String key = simpleMode ? "_" + node.getLocalName() : "_children"; + Collections.reverse(nodeChildren); + if (nodeChildren.size() > 0) { + // Before adding the children we need to handle mixed text + Object text = elementMap.get("_text"); + if (text instanceof List) { + for (Object element : (List) text) { + nodeChildren.add(element); + } + elementMap.remove("_text"); + } + + elementMap.put(key, nodeChildren); + } + } + } + + if (!elementMap.isEmpty()) { + stack.addLast(elementMap); + } + } + + /** + * Handle TEXT nodes and CDATA nodes + * + * @param node + * @param elementMap + */ + private static void handleTextNode(org.w3c.dom.Node node, Map elementMap) { + Object text = ""; + int nodeType = node.getNodeType(); + switch (nodeType) { + case org.w3c.dom.Node.TEXT_NODE: + text = normalizeText(node.getNodeValue()); + break; + case org.w3c.dom.Node.CDATA_SECTION_NODE: + text = normalizeText(((CharacterData) node).getData()); + break; + default: + break; + } + + // If the text is valid ... 
+ if (!StringUtils.isEmpty(text.toString())) { + // We check if we have already collected some text previously + Object previousText = elementMap.get("_text"); + if (previousText != null) { + // If we just have a "_text" key than we need to collect to a List + text = Arrays.asList(previousText.toString(), text); + } + elementMap.put("_text", text); + } + } + + /** + * Remove trailing whitespaces and new line characters + * + * @param text + * @return + */ + private static String normalizeText(String text) { + String[] tokens = StringUtils.split(text, "\n"); + for (int i = 0; i < tokens.length; i++) { + tokens[i] = tokens[i].trim(); + } + + return StringUtils.join(tokens, " ").trim(); + } + + public static RuntimeException generateXmlDoctypeException() { + throw new RuntimeException("XML documents with a DOCTYPE are not allowed."); + } + } + + + /** + * Taken from GraphMLReader + * placed in APOC Core + */ + public static class Import { + + public static final String LABEL_SPLIT = " *: *"; + private final GraphDatabaseService db; + private boolean storeNodeIds; + private RelationshipType defaultRelType = RelationshipType.withName("UNKNOWN"); + private ExportConfig.NodeConfig source; + private ExportConfig.NodeConfig target; + private int batchSize = 40000; + private Reporter reporter; + private boolean labels; + + public Import storeNodeIds() { + this.storeNodeIds = true; + return this; + } + + public Import relType(String name) { + this.defaultRelType = RelationshipType.withName(name); + return this; + } + + public Import batchSize(int batchSize) { + this.batchSize = batchSize; + return this; + } + + public Import nodeLabels(boolean readLabels) { + this.labels = readLabels; + return this; + } + + public Import source(ExportConfig.NodeConfig sourceConfig) { + this.source = sourceConfig; + return this; + } + + public Import target(ExportConfig.NodeConfig targetConfig) { + this.target = targetConfig; + return this; + } + + public Import reporter(Reporter reporter) { + 
this.reporter = reporter; + return this; + } + + public ExportConfig.NodeConfig getSource() { + return source; + } + + public ExportConfig.NodeConfig getTarget() { + return target; + } + + enum Type { + BOOLEAN() { + Object parse(String value) { + return Boolean.valueOf(value); + } + + Object parseList(String value) { + return Type.parseList(value, Boolean.class, (i) -> (Boolean) i); + } + }, + INT() { + Object parse(String value) { + return Integer.parseInt(value); + } + + Object parseList(String value) { + return Type.parseList(value, Integer.class, (n) -> ((Number) n).intValue()); + } + }, + LONG() { + Object parse(String value) { + return Long.parseLong(value); + } + + Object parseList(String value) { + return Type.parseList(value, Long.class, (i) -> ((Number) i).longValue()); + } + }, + FLOAT() { + Object parse(String value) { + return Float.parseFloat(value); + } + + Object parseList(String value) { + return Type.parseList(value, Float.class, (i) -> ((Number) i).floatValue()); + } + }, + DOUBLE() { + Object parse(String value) { + return Double.parseDouble(value); + } + + Object parseList(String value) { + return Type.parseList(value, Double.class, (i) -> ((Number) i).doubleValue()); + } + }, + STRING() { + Object parse(String value) { + return value; + } + + Object parseList(String value) { + return Type.parseList(value, String.class, (i) -> (String) i); + } + }; + + abstract Object parse(String value); + + abstract Object parseList(String value); + + public static T[] parseList(String value, Class asClass, Function convert) { + List parsed = JsonUtil.parse(value, null, List.class); + T[] converted = (T[]) Array.newInstance(asClass, parsed.size()); + + for (int i = 0; i < parsed.size(); i++) converted[i] = convert.apply(parsed.get(i)); + return converted; + } + + public static Type forType(String type) { + if (type == null) return STRING; + return valueOf(type.trim().toUpperCase()); + } + } + + static class Key { + String nameOrId; + boolean forNode; + Type 
listType; + Type type; + Object defaultValue; + + public Key(String nameOrId, String type, String listType, String forNode) { + this.nameOrId = nameOrId; + this.type = Type.forType(type); + if (listType != null) { + this.listType = Type.forType(listType); + } + this.forNode = forNode == null || forNode.equalsIgnoreCase("node"); + } + + private static Key defaultKey(String id, boolean forNode) { + return new Key(id, "string", null, forNode ? "node" : "edge"); + } + + public void setDefault(String data) { + this.defaultValue = type.parse(data); + } + + public Object parseValue(String input) { + if (input == null || input.trim().isEmpty()) return defaultValue; + if (listType != null) return listType.parseList(input); + return type.parse(input); + } + } + + public static final QName ID = QName.valueOf("id"); + public static final QName LABELS = QName.valueOf("labels"); + public static final QName LABEL = QName.valueOf("label"); + public static final QName VALUE = QName.valueOf("value"); + public static final QName FOR = QName.valueOf("for"); + public static final QName NAME = QName.valueOf("attr.name"); + public static final QName TYPE = QName.valueOf("attr.type"); + public static final QName DATA_TYPE = QName.valueOf("type"); + public static final QName LIST = QName.valueOf("attr.list"); + public static final QName KEY = QName.valueOf("key"); + public static final QName KIND = QName.valueOf("kind"); + + public Import(GraphDatabaseService db) { + this.db = db; + } + + public long parseXML(Reader input, TerminationGuard terminationGuard) throws XMLStreamException { + Map dataMap = new HashMap<>(); + Map cache = new HashMap<>(1024 * 32); + XMLInputFactory inputFactory = XMLInputFactory.newInstance(); + inputFactory.setProperty("javax.xml.stream.isCoalescing", true); + inputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, true); + inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false); + 
inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); + XMLEventReader reader = inputFactory.createXMLEventReader(input); + Entity last = null; + Map nodeKeys = new HashMap<>(); + Map relKeys = new HashMap<>(); + int count = 0; + BatchTransaction tx = new BatchTransaction(db, batchSize * 10, reporter); + try { + + while (reader.hasNext()) { + terminationGuard.check(); + XMLEvent event; + try { + event = (XMLEvent) reader.next(); + if (event.getEventType() == XMLStreamConstants.DTD) { + generateXmlDoctypeException(); + } + } catch (Exception e) { + // in case of unicode invalid chars we skip the event, or we exit in case of EOF (null-safe: an exception without a message is skipped too) + if (StringUtils.contains(e.getMessage(), "Unexpected EOF")) { + break; + } else if (StringUtils.contains(e.getMessage(), "DOCTYPE")) { + throw e; + } + continue; + } + if (event.isStartElement()) { + + StartElement element = event.asStartElement(); + String name = element.getName().getLocalPart(); + + if (name.equals("graphml") || name.equals("graph") || name.equals("gexf")) continue; + if (name.equals("attribute")) { + String id = getAttribute(element, ID); + String type = getAttribute(element, DATA_TYPE); + dataMap.put(id, type); // remembers the declared type of each attribute id for later value conversion + } + if (name.equals("key")) { + String id = getAttribute(element, ID); + Key key = new Key( + getAttribute(element, NAME), + getAttribute(element, TYPE), + getAttribute(element, LIST), + getAttribute(element, FOR)); + + XMLEvent next = peek(reader); + if (next.isStartElement() + && next.asStartElement() + .getName() + .getLocalPart() + .equals("default")) { + reader.nextEvent().asStartElement(); + key.setDefault(reader.nextEvent().asCharacters().getData()); + } + if (key.forNode) nodeKeys.put(id, key); + else relKeys.put(id, key); + continue; + } + if (name.equals("attvalue")) { // Changed from data to attvalue for node properties in gexf + if (last == null) continue; + String id = getAttribute(element, FOR); + boolean isNode = last instanceof Node; + Key key = isNode ? 
nodeKeys.get(id) : relKeys.get(id); + if (key == null) key = Key.defaultKey(id, isNode); + final Map.Entry eventEntry = getDataEventEntry(reader, key); + final XMLEvent next = eventEntry.getKey(); + final Object value = getAttribute(element, VALUE); + if (value != null) { + if (this.labels && isNode && id.equals("labels")) { + addLabels((Node) last, value.toString()); + } else if (!this.labels || isNode || !id.equals("label")) { + Object convertedValue = toValidValue(value, key.nameOrId, dataMap); + last.setProperty(key.nameOrId, convertedValue); + if (reporter != null) reporter.update(0, 0, 1); + } + } else if (next.getEventType() == XMLStreamConstants.END_ELEMENT) { + last.setProperty(key.nameOrId, StringUtils.EMPTY); + if (reporter != null) reporter.update(0, 0, 1); // reporter is optional, guard like every other update call in this method + } + continue; + } + if (name.equals("node")) { + tx.increment(); + String id = getAttribute(element, ID); + Node node = tx.getTransaction().createNode(); + if (this.labels) { + String labels = getAttribute(element, LABEL); // Changed from labels to label to fit gexf property format + addLabels(node, labels); + } + if (storeNodeIds) node.setProperty("id", id); + setDefaults(nodeKeys, node); + last = node; + cache.put(id, node.getElementId()); // maps the file-level node id to the created node's element id for edge resolution + if (reporter != null) reporter.update(1, 0, 0); + count++; + continue; + } + if (name.equals("edge")) { + tx.increment(); + String label = getAttribute(element, KIND); // changed from label to kind for gexf + Node from = getByNodeId(cache, tx.getTransaction(), element, NodeExport.NodeType.SOURCE); + Node to = getByNodeId(cache, tx.getTransaction(), element, NodeExport.NodeType.TARGET); + + RelationshipType relationshipType = + label == null ? 
getRelationshipType(reader) : RelationshipType.withName(label); + Relationship relationship = from.createRelationshipTo(to, relationshipType); + setDefaults(relKeys, relationship); + last = relationship; + if (reporter != null) reporter.update(0, 1, 0); + count++; + } + } + } + tx.doCommit(); + } catch (Exception e) { + tx.rollback(); + throw e; + } finally { + tx.close(); + reader.close(); + } + return count; + } + + private Map.Entry getDataEventEntry(XMLEventReader reader, Key key) { + Object value = key.defaultValue; + + final Map.Entry peekEntry = peekRecursively(reader, null); + if (peekEntry.getValue() != null) { + value = key.parseValue(peekEntry.getValue()); + } + return new AbstractMap.SimpleEntry<>(peekEntry.getKey(), value); + } + + private Map.Entry peekRecursively(XMLEventReader reader, String data) { + try { + final XMLEvent peek = peek(reader); + // in case of char, we concat the result to the current value and redo the peek + // in order to obtain e.g. from a string "abcdef" --> "abcdef" + if (peek.isCharacters()) { + data = StringUtils.join(data, reader.nextEvent().asCharacters().getData()); + return peekRecursively(reader, data); + } + // in case the event is not a char we continue setting labels/properties + return new AbstractMap.SimpleEntry<>(peek, data); + } catch (Exception e) { + // in case of unicode invalid chars we continue until we get a valid event + return peekRecursively(reader, data); + } + } + + private Node getByNodeId( + Map cache, Transaction tx, StartElement element, NodeExport.NodeType nodeType) { + final NodeExport xmlNodeInterface = nodeType.get(); + final ExportConfig.NodeConfig nodeConfig = xmlNodeInterface.getNodeConfigReader(this); + + final String sourceTargetValue = getAttribute(element, QName.valueOf(nodeType.getName())); + + final String id = cache.get(sourceTargetValue); + // without source/target config, we look for the internal id + if (StringUtils.isBlank(nodeConfig.label)) { + return tx.getNodeByElementId(id); + 
} + // with source/target configured, we search a node with a specified label + // and with a type specified in sourceType, if present, or string by default + final String attribute = getAttribute(element, QName.valueOf(nodeType.getNameType())); + final Object value = + attribute == null ? sourceTargetValue : Type.forType(attribute).parse(sourceTargetValue); + + return tx.findNode( + Label.label(nodeConfig.label), + Optional.ofNullable(nodeConfig.id).orElse("id"), + value); + } + + private RelationshipType getRelationshipType(XMLEventReader reader) throws XMLStreamException { + if (this.labels) { + XMLEvent peek = reader.peek(); + boolean isChar = peek.isCharacters(); + if (isChar && !(peek.asCharacters().isWhiteSpace())) { + String value = peek.asCharacters().getData(); + String el = ":"; + String typeRel = value.contains(el) ? value.replace(el, StringUtils.EMPTY) : value; + return RelationshipType.withName(typeRel.trim()); + } + + boolean notStartElementOrContainsKeyLabel = isChar || !peek.isStartElement() || containsLabelKey(peek); + + if (!peek.isEndDocument() && notStartElementOrContainsKeyLabel) { + reader.nextEvent(); + return getRelationshipType(reader); + } + } + reader.nextEvent(); // to prevent eventual wrong reader (f.e. 
self-closing tag) + return defaultRelType; + } + + private boolean containsLabelKey(XMLEvent peek) { + final Attribute keyAttribute = peek.asStartElement().getAttributeByName(new QName("key")); + return keyAttribute != null && keyAttribute.getValue().equals("label"); + } + + private void addLabels(Node node, String labels) { + if (labels == null) return; + labels = labels.trim(); + if (labels.isEmpty()) return; + String[] parts = labels.split(LABEL_SPLIT); + for (String part : parts) { + if (part.trim().isEmpty()) continue; + node.addLabel(Label.label(part.trim())); + } + } + + private XMLEvent peek(XMLEventReader reader) throws XMLStreamException { + XMLEvent peek = reader.peek(); + if (peek.isCharacters() && (peek.asCharacters().isWhiteSpace())) { + reader.nextEvent(); + return peek(reader); + } + return peek; + } + + private void setDefaults(Map keys, Entity pc) { + if (keys.isEmpty()) return; + for (Key key : keys.values()) { + if (key.defaultValue != null) pc.setProperty(key.nameOrId, key.defaultValue); + } + } + + private String getAttribute(StartElement element, QName qname) { + Attribute attribute = element.getAttributeByName(qname); + return attribute != null ? 
attribute.getValue() : null; + } + } + + /** + * Taken from NodeExport + * placed in APOC Core + */ + interface NodeExport { + + ExportConfig.NodeConfig getNodeConfigReader(Import reader); + + enum NodeType { + SOURCE("source", Import::getSource), + + TARGET("target", Import::getTarget); + + private final String name; + private final NodeExport exportNode; + + NodeType(String name, NodeExport exportNode) { + this.name = name; + this.exportNode = exportNode; + } + + public String getName() { + return name; + } + + public String getNameType() { + return name + "Type"; + } + + NodeExport get() { + return exportNode; + } + } + } +} diff --git a/extended/src/main/java/apoc/util/ExtendedUtil.java b/extended/src/main/java/apoc/util/ExtendedUtil.java index 1190b0e371..1008964339 100644 --- a/extended/src/main/java/apoc/util/ExtendedUtil.java +++ b/extended/src/main/java/apoc/util/ExtendedUtil.java @@ -5,6 +5,7 @@ import com.fasterxml.jackson.core.json.JsonWriteFeature; import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.text.WordUtils; import org.neo4j.exceptions.Neo4jException; import org.neo4j.graphdb.Entity; import org.neo4j.graphdb.ExecutionPlanDescription; @@ -151,46 +152,61 @@ private static Object getNeo4jValue(Object object) { * For example `mapping: {myPropertyKey: "DateArray"}` */ private static Object convertValue(String value, String typeName) { + typeName = typeName.toLowerCase(); // Suitable to work with Parquet/Arrow/Gexf switch (typeName) { // {"crs":"wgs-84-3d","latitude":13.1,"longitude":33.46789,"height":100.0} - case "Point": + case "point": return getPointValue(value); - case "LocalDateTime": + case "localdatetime": return LocalDateTimeValue.parse(value).asObjectCopy(); - case "LocalTime": + case "localtime": return LocalTimeValue.parse(value).asObjectCopy(); - case "DateTime": + case "datetime": return DateTimeValue.parse(value, () -> ZoneId.of("Z")).asObjectCopy(); - case "Time": 
+ case "time": return TimeValue.parse(value, () -> ZoneId.of("Z")).asObjectCopy(); - case "Date": + case "date": return DateValue.parse(value).asObjectCopy(); - case "Duration": + case "duration": return DurationValue.parse(value); - case "Char": + case "boolean": + return Boolean.parseBoolean(value); + case "char": return value.charAt(0); - case "Byte": + case "byte": return value.getBytes(); - case "Double": + case "double": return Double.parseDouble(value); - case "Float": + case "float": return Float.parseFloat(value); - case "Short": + case "short": return Short.parseShort(value); - case "Int": + case "int": + case "integer": return Integer.parseInt(value); - case "Long": + case "long": return Long.parseLong(value); - case "Node", "Relationship": + case "node", "relationship": return JsonUtil.parse(value, null, Map.class); + case "no_value": case "NO_VALUE": return null; + case "listboolean": + value = StringUtils.removeStart(value, "["); + value = StringUtils.removeEnd(value, "]"); + String dataType = typeName.replace("array", "").replace("list", ""); + + final Object[] arr = getPrototypeFor(dataType); + return Arrays.stream(value.split(",")) + .map(item -> convertValue(StringUtils.trim(item), dataType)) + .toList() + .toArray(arr); default: // If ends with "Array", for example StringArray - if (typeName.endsWith("Array")) { + if (typeName.endsWith("array") || typeName.startsWith("list")) { value = StringUtils.removeStart(value, "["); value = StringUtils.removeEnd(value, "]"); - String array = typeName.replace("Array", ""); + String array = typeName.replace("array", "").replace("list", ""); final Object[] prototype = getPrototypeFor(array); return Arrays.stream(value.split(",")) @@ -222,23 +238,24 @@ private static PointValue getPointValue(String value) { // similar to CsvPropertyConverter public static Object[] getPrototypeFor(String type) { + type = type.toLowerCase(); // Suitable to work with Parquet/Arrow/Gexf return switch (type) { - case "Long" -> new 
Long[]{}; - case "Integer" -> new Integer[]{}; - case "Double" -> new Double[]{}; - case "Float" -> new Float[]{}; - case "Boolean" -> new Boolean[]{}; - case "Byte" -> new Byte[]{}; - case "Short" -> new Short[]{}; - case "Char" -> new Character[]{}; - case "String" -> new String[]{}; - case "DateTime" -> new ZonedDateTime[]{}; - case "LocalTime" -> new LocalTime[]{}; - case "LocalDateTime" -> new LocalDateTime[]{}; - case "Point" -> new PointValue[]{}; - case "Time" -> new OffsetTime[]{}; - case "Date" -> new LocalDate[]{}; - case "Duration" -> new DurationValue[]{}; + case "long" -> new Long[]{}; + case "integer" -> new Integer[]{}; + case "double" -> new Double[]{}; + case "float" -> new Float[]{}; + case "boolean" -> new Boolean[]{}; + case "byte" -> new Byte[]{}; + case "short" -> new Short[]{}; + case "char" -> new Character[]{}; + case "string" -> new String[]{}; + case "datetime" -> new ZonedDateTime[]{}; + case "localtime" -> new LocalTime[]{}; + case "localdatetime" -> new LocalDateTime[]{}; + case "point" -> new PointValue[]{}; + case "time" -> new OffsetTime[]{}; + case "date" -> new LocalDate[]{}; + case "duration" -> new DurationValue[]{}; default -> throw new IllegalStateException("Type " + type + " not supported."); }; } diff --git a/extended/src/main/resources/extended.txt b/extended/src/main/resources/extended.txt index 6be04562fb..2889c5544a 100644 --- a/extended/src/main/resources/extended.txt +++ b/extended/src/main/resources/extended.txt @@ -81,6 +81,8 @@ apoc.graph.filterProperties apoc.import.arrow apoc.import.parquet apoc.load.csv +apoc.load.gexf +apoc.import.gexf apoc.load.csvParams apoc.load.directory apoc.load.directory.async.add diff --git a/extended/src/test/java/apoc/load/GexfTest.java b/extended/src/test/java/apoc/load/GexfTest.java new file mode 100644 index 0000000000..db38464139 --- /dev/null +++ b/extended/src/test/java/apoc/load/GexfTest.java @@ -0,0 +1,215 @@ +package apoc.load; + +import apoc.util.TestUtil; +import 
org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.neo4j.graphdb.Relationship; +import org.neo4j.graphdb.ResourceIterator; +import org.neo4j.test.rule.DbmsRule; +import org.neo4j.test.rule.ImpermanentDbmsRule; + +import java.util.List; +import java.util.Map; + +import static apoc.ApocConfig.APOC_IMPORT_FILE_ENABLED; +import static apoc.ApocConfig.APOC_IMPORT_FILE_USE_NEO4J_CONFIG; +import static apoc.ApocConfig.apocConfig; +import static apoc.util.ExtendedTestUtil.assertRelationship; +import static apoc.util.MapUtil.map; +import static apoc.util.TestUtil.testCall; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class GexfTest { + + @Rule + public DbmsRule db = new ImpermanentDbmsRule(); + + @Before + public void setup() { + apocConfig().setProperty(APOC_IMPORT_FILE_ENABLED, true); + apocConfig().setProperty(APOC_IMPORT_FILE_USE_NEO4J_CONFIG, false); + TestUtil.registerProcedure(db, Gexf.class); + } + + @After + public void tearDown() { + db.shutdown(); + } + + @Test + public void testLoadGexf() { + final String file = ClassLoader.getSystemResource("gexf/single-node.gexf").toString(); + testCall( + db, + "CALL apoc.load.gexf($file)", + Map.of("file", file), + (row) -> { + Map value = (Map) row.get("value"); + String expected = "{_type=gexf, _children=[{_type=graph, defaultedgetype=directed, _children=[{_type=nodes, _children=[{_type=node, _children=[{_type=attvalues, _children=[{_type=attvalue, for=0, value=http://gephi.org}]}], id=0, label=bar}]}]}], version=1.2}"; + assertEquals(expected, value.toString()); + }); + } + + @Test + public void testImportGexf() { + final String file = ClassLoader.getSystemResource("gexf/data.gexf").toString(); + TestUtil.testCall( + db, + "CALL apoc.import.gexf($file, {readLabels:true})", + map("file", file), + (r) -> { + assertEquals("gexf", r.get("format")); + assertEquals(5L, 
r.get("nodes")); + assertEquals(8L, r.get("relationships")); + }); + + TestUtil.testCallCount(db, "MATCH (n) RETURN n",5); + + TestUtil.testResult(db, "MATCH (n:Gephi) RETURN properties(n) as props", r -> { + ResourceIterator propsIterator = r.columnAs("props"); + Map props = propsIterator.next(); + assertEquals("http://gephi.org", props.get("0")); + assertEquals(1.0f, props.get("1")); + + props = propsIterator.next(); + assertEquals("http://test.gephi.org", props.get("0")); + }); + + TestUtil.testResult(db, "MATCH (n:BarabasiLab) RETURN properties(n) as props", r -> { + ResourceIterator propsIterator = r.columnAs("props"); + Map props = propsIterator.next(); + assertEquals("http://barabasilab.com", props.get("0")); + assertEquals(1.0f, props.get("1")); + }); + + Map multiDataTypeNodeProps = Map.of( + "0", "http://gephi.org", + "1", 1.0f, + "room", 10, + "price", Double.parseDouble("10.02"), + "projects", 300L, + "members", new String[] {"Altomare", "Sterpeto", "Lino"}, + "pins", new boolean[]{true, false, true, false} + ); + + TestUtil.testResult( + db, + "MATCH ()-[rel]->() RETURN rel ORDER BY rel.score", + r -> { + final ResourceIterator rels = r.columnAs("rel"); + + assertRelationship(rels.next(), "KNOWS", Map.of("score", 1.5f), + List.of("Gephi"), multiDataTypeNodeProps, + List.of("Webatlas"), Map.of("0", "http://webatlas.fr", "1", 2.0f) + ); + + assertRelationship(rels.next(), "BAZ", + Map.of("score", 2.0f, "foo", "bar"), + List.of("Gephi"), multiDataTypeNodeProps, + List.of("Gephi"), multiDataTypeNodeProps + ); + + assertRelationship(rels.next(), "HAS_TICKET", Map.of("score", 3f, "ajeje", "brazorf"), + List.of("Gephi"), + multiDataTypeNodeProps, + List.of("RTGI"), + Map.of("0", "http://rtgi.fr", "1", 1.0f) + ); + + assertRelationship(rels.next(), "KNOWS", + Map.of(), + List.of("Gephi"), + multiDataTypeNodeProps, + List.of("RTGI"), + Map.of("0", "http://rtgi.fr", "1", 1.0f) + ); + + assertRelationship(rels.next(), "KNOWS", + Map.of(), + List.of("Webatlas"), + 
Map.of("0", "http://webatlas.fr", "1", 2.0f), + List.of("Gephi"), + multiDataTypeNodeProps + ); + + assertRelationship(rels.next(), "KNOWS", + Map.of(), + List.of("RTGI"), Map.of("0", "http://rtgi.fr", "1", 1.0f), + List.of("Webatlas"), Map.of("0", "http://webatlas.fr", "1", 2.0f) + ); + + assertRelationship(rels.next(), "KNOWS", + Map.of(), + List.of("Gephi"), + multiDataTypeNodeProps, + List.of("Webatlas", "BarabasiLab"), + Map.of("0", "http://barabasilab.com", "1", 1.0f, "2", false) + ); + + assertRelationship(rels.next(), "KNOWS", + Map.of(), + List.of("Gephi"), + Map.of("0", "http://test.gephi.org", "1", 2.0f), + List.of("Webatlas", "BarabasiLab"), + Map.of("0", "http://barabasilab.com", "1", 1.0f, "2", false) + ); + + assertFalse(rels.hasNext()); + } + ); + } + + @Test + public void testImportGexfWithStoreNodeIds() { + final String file = ClassLoader.getSystemResource("gexf/single-node.gexf").toString(); + TestUtil.testCall( + db, + "CALL apoc.import.gexf($file, {storeNodeIds: true})", + map("file", file), + (r) -> { + assertEquals("gexf", r.get("format")); + assertEquals(1L, r.get("nodes")); + }); + + Map props = TestUtil.singleResultFirstColumn(db, "MATCH (n) RETURN properties(n) AS props"); + assertEquals("http://gephi.org", props.get("0")); + assertTrue( props.containsKey("id") ); + } + + @Test + public void testImportGexfWithDefaultRelationshipTypeSourceAndTargetConfigs() { + String defaultRelType = "TEST_DEFAULT"; + final String file = ClassLoader.getSystemResource("gexf/single-rel.gexf").toString(); + + db.executeTransactionally("CREATE (:Foo {startId: 'start'})"); + db.executeTransactionally("CREATE (:Bar {endId: 'end'})"); + + TestUtil.testCall( + db, + "CALL apoc.import.gexf($file, {defaultRelationshipType: $defaultRelType, source: $source, target: $target})", + map("file", file, + "defaultRelType", defaultRelType, + "source", map("label", "Foo", "id", "startId"), + "target", map("label", "Bar", "id", "endId") + ), + (r) -> { + assertEquals("gexf", 
r.get("format")); + assertEquals(1L, r.get("relationships")); + }); + + TestUtil.testCall(db, "MATCH ()-[rel]->() RETURN rel", r -> { + Relationship rel = (Relationship) r.get("rel"); + assertRelationship(rel, defaultRelType, + Map.of(), + List.of("Foo"), + Map.of("startId", "start"), + List.of("Bar"), + Map.of("endId", "end") + ); + }); + } +} diff --git a/extended/src/test/java/apoc/util/ExtendedTestUtil.java b/extended/src/test/java/apoc/util/ExtendedTestUtil.java index 2a3d0fbbba..8f467797c5 100644 --- a/extended/src/test/java/apoc/util/ExtendedTestUtil.java +++ b/extended/src/test/java/apoc/util/ExtendedTestUtil.java @@ -1,18 +1,28 @@ package apoc.util; +import apoc.util.collection.Iterables; import org.neo4j.graphdb.GraphDatabaseService; +import org.neo4j.graphdb.Label; +import org.neo4j.graphdb.Node; +import org.neo4j.graphdb.Relationship; +import org.neo4j.graphdb.RelationshipType; import org.neo4j.graphdb.Result; import org.neo4j.graphdb.ResultTransformer; import org.neo4j.graphdb.security.URLAccessChecker; +import org.neo4j.internal.helpers.collection.Iterators; import org.neo4j.test.assertion.Assert; import java.util.Collections; +import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; +import java.util.stream.Collectors; import static apoc.util.TestUtil.testCall; import static apoc.util.TestUtil.testCallAssertions; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -28,6 +38,33 @@ public static class MockURLAccessChecker { public static final URLAccessChecker INSTANCE = url -> url; } + public static void assertRelationship(Relationship rel, + String expectedRelType, Map expectedProps, + List expectedStartNodeLabels, + Map expectedStartNodeProps, + List expectedEndNodeLabels, + Map expectedEndNodeProps) { + + Node startNode = 
rel.getStartNode(); + Node endNode = rel.getEndNode(); + + assertMapEquals(expectedProps, rel.getAllProperties()); + assertEquals(RelationshipType.withName(expectedRelType), rel.getType()); + + Set