Improves DataFrame to RDF conversion, adds support for RDF to DataFrame conversion, updates dependencies.

cadmiumkitty · cadmiumkitty · commit ded56b15c219 · 2021-01-03T14:27:45.000Z
diff --git a/.gitignore b/.gitignore
@@ -97,3 +97,7 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+### VisualStudio Code ###
+
+.vscode/
diff --git a/README.rst b/README.rst
@@ -1,12 +1,9 @@
 RdfPandas
 =========
 
-RdfPandas is a module providing RDF support for Pandas. It consists initially 
-of a simple function for graph conversion to create RDFLib Graph data from 
-Pandas DataFrame.
-
-The graph data can then be serialized using RDFLib serialize method on the 
-graph.
+RdfPandas is a module providing RDF support for Pandas. It consists of
+two simple functions for Graph to DataFrame conversion and 
+DataFrame to Graph conversion.
 
 Getting Started
 ---------------
@@ -31,7 +28,7 @@ Installation
 Usage
 -----
 
-Getting RDF out of the DataFrame
+Creating RDF from DataFrame
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 ::
@@ -40,6 +37,27 @@ Getting RDF out of the DataFrame
   import pandas as pd
   import rdflib
  
-  df = pd.DataFrame()
+  df = pd.read_csv('to_graph_test.csv', index_col = '@id', keep_default_na = False)
   g = to_graph(df)
   s = g.serialize(format='turtle')
+
+Creating DataFrame from RDF
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+  from rdfpandas.graph import to_dataframe
+  import pandas as pd
+  import rdflib
+ 
+  g = rdflib.Graph()
+  g.parse('to_df_test.ttl', format = 'ttl')
+  df = to_dataframe(g)  
+  df.to_csv('test.csv', index = True, index_label = "@id")
+
+Gotchas
+-------
+
+No special effort is made for dealing with types, so please be aware of Pandas
+features such as https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#support-for-integer-na
+that may result in surprising RDF statements like ``"10.0"^^<xsd:integer>``.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -20,13 +20,13 @@
 # -- Project information -----------------------------------------------------
 
 project = 'rdfpandas'
-copyright = '2018, Eugene Morozov'
+copyright = '2020, Eugene Morozov'
 author = 'Eugene Morozov'
 
 # The short X.Y version
-version = 'v0.1.1'
+version = 'v1.0.0'
 # The full version, including alpha/beta/rc tags
-release = 'v0.1.1'
+release = 'v1.0.0'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -15,12 +15,9 @@ RdfPandas
 Introduction
 ============
    
-RdfPandas is a module providing RDF support for Pandas. It consists initially 
-of a simple function for graph conversion to create RDFLib Graph data from 
-Pandas DataFrame.
-
-The graph data can then be serialized using RDFLib serialize method on the 
-graph.
+RdfPandas is a module providing RDF support for Pandas. It consists of
+two simple functions for Graph to DataFrame conversion and 
+DataFrame to Graph conversion.
 
 For more details and the source code see https://github.com/cadmiumkitty/rdfpandas/
 
diff --git a/rdfpandas/__init__.py b/rdfpandas/__init__.py
@@ -1 +1 @@
-from .graph import to_graph
+from .graph import to_graph, to_dataframe
diff --git a/rdfpandas/graph.py b/rdfpandas/graph.py
@@ -1,14 +1,21 @@
 # -*- coding: utf-8 -*-
 import pandas as pd
-import rdflib
-import logging
+import numpy as np
+from rdflib import Graph, Literal, URIRef, BNode
+from rdflib.term import Identifier
+from rdflib.namespace import NamespaceManager
+import re
 
-
-def to_graph(df: pd.DataFrame) -> rdflib.Graph:
+def to_graph(df: pd.DataFrame) -> Graph:
     """
-    Takes Pandas DataFrame and returns RDFLib Graph using row indices as subjects
-    and column indices as predictes. Object types are inferred from the Series 
-    content type.
+    Takes Pandas DataFrame and returns RDFLib Graph.
+    Row indices are used as subjects and column indices as predicates. 
+    Object types are inferred from the column index pattern of 
+    "predicate{rdfLib Identifier instance class name}[index](type)@language".
+    Index numbers simply create additional statements as opposed 
+    to attempting to construct a new rdfs:List or rdfs:Container.
+    Namespaces need to be bound by the user of the method prior
+    to serialization.
 
     Parameters
     ----------
@@ -18,21 +25,219 @@ def to_graph(df: pd.DataFrame) -> rdflib.Graph:
     Returns
     -------
     rdflib.Graph
-        Graph created from Pandas DataFrame
+        Graph created from Pandas DataFrame.
 
     """
     
-    g = rdflib.Graph()
-    
+    g = Graph()
+
     for (index, series) in df.iterrows():
         for (column, value) in series.iteritems():
-            if (type(value) == 'bytes'):
-                g.add((rdflib.URIRef(index),
-                       rdflib.URIRef(column), 
-                       rdflib.Literal(value.decode('utf-8'))))
-            else:
-                g.add((rdflib.URIRef(index),
-                       rdflib.URIRef(column), 
-                       rdflib.Literal(value)))
-        
+            match = re.search('([\w?:/.]*)(\{(\w*)\})?(\[(\d*)\])?(\(([\w?:/.]*)\))?(@(\w*))?', column)
+
+            if pd.notna(value) and pd.notnull(value):
+                s = _get_identifier(index)
+                p = _get_identifier(match.group(1))
+                if isinstance(value, bytes):
+                    o = _get_identifier(value.decode('utf-8'), match.group(3), match.group(7), match.group(9))
+                else:
+                    o = _get_identifier(value, match.group(3), match.group(7), match.group(9))
+                g.add((s, p, o))
+
     return g
+
+
+def to_dataframe(g: Graph) -> pd.DataFrame:
+    """
+    Takes rdfLib Graph object and creates Pandas DataFrame.
+    Indices are subjects and attempt is made to construct CURIEs
+    using namespace manager of the rdfLib Graph.
+    Columns are predicates and attempt is made to construct CURIEs
+    using namespace manager of the rdfLib Graph, similar to indices.
+    Column names are created using 
+    "predicate{rdfLib Identifier instance class name}[index](type)@language"
+    pattern to allow for round trip conversion.
+    Multiple objects for the same subject and predicate
+    result in columns with an index in their names.
+    No attempts are made at type conversion, all objects are strings in the
+    DataFrame.
+
+    Parameters
+    ----------
+    g : rdflib.Graph
+        rdfLib Graph.
+
+    Returns
+    -------
+    pd.DataFrame
+        Pandas DataFrame created from rdfLib Graph.
+
+    """
+
+    subjects = {}
+    predicates = {}
+
+    for s in g.subjects():
+        if s not in subjects:
+            subjects[s] = s
+            s_predicates = {}
+            for p, o in sorted(g.predicate_objects(s)):
+                idl = _get_idl_for_identifier(o)
+                if p in s_predicates:
+                    if idl in s_predicates[p]:
+                        s_predicates[p][idl] = s_predicates[p][idl] + 1
+                    else:
+                        s_predicates[p][idl] = 1
+                else:
+                    s_predicates[p] = {idl: 1}
+            for p in s_predicates:
+                if p in predicates:
+                    for idl in s_predicates[p]:
+                        if idl in predicates[p]:
+                            predicates[p][idl] = max(s_predicates[p][idl], predicates[p][idl])
+                        else:
+                            predicates[p][idl] = s_predicates[p][idl]
+                else:
+                    predicates[p] = s_predicates[p]
+
+    series = {}
+
+    for p in predicates:
+        idls_len = len(predicates[p])
+        for idl in predicates[p]:
+            instance = idl[0]
+            datatype = idl[1]
+            language = idl[2]
+            idl_len = predicates[p][idl]
+            for index in range(idl_len):
+                series_name = f'{_get_str_for_uri(g.namespace_manager, p)}{{{instance}}}'
+                if idl_len > 1:
+                    series_name = ''.join([series_name, f'[{index}]'])
+                if datatype:
+                    series_name = ''.join([series_name, f'({_get_str_for_uri(g.namespace_manager, datatype)})'])
+                if language:
+                    series_name = ''.join([series_name, f'@{language}'])
+                p_subjects = []
+                p_objects = []
+                if idls_len == 1 and idl_len == 1:
+                    for s, o in sorted(g.subject_objects(p)):
+                        p_subjects.append(_get_str_for_uri(g.namespace_manager, s))
+                        if isinstance(o, Literal):
+                            p_objects.append(str(o))
+                        else:
+                            p_objects.append(_get_str_for_uri(g.namespace_manager, o))
+                else:
+                    s_index = 0
+                    last_seen_subject = None
+                    for s, o in sorted(g.subject_objects(p)):
+                        if s != last_seen_subject:
+                            s_index = 0
+                        o_idl = _get_idl_for_identifier(o)
+                        if o_idl == idl:
+                            if s_index == index:
+                                p_subjects.append(_get_str_for_uri(g.namespace_manager, s))
+                                if isinstance(o, Literal):
+                                    p_objects.append(str(o))
+                                else:
+                                    p_objects.append(_get_str_for_uri(g.namespace_manager, o))
+                            s_index = s_index + 1
+                        last_seen_subject = s
+                series[series_name] = pd.Series(data = p_objects, index = p_subjects, dtype = np.unicode_)
+
+    return pd.DataFrame(series)
+
+def _get_identifier(value: object, instance: str = None, datatype: str = None, language: str = None) -> Identifier:
+    """
+    Takes value extracted from the index, column or cell and returns
+    an instance of Identifier (Literal, URIRef or BNode) using correct 
+    datatype and language.
+
+    Parameters
+    ----------
+    value : object
+        Value of index, column or cell
+    instance : str
+        Name of the rdfLib Identifier class to use
+    datatype : str
+        Datatype of rdfLib Literal to use 
+        (see https://rdflib.readthedocs.io/en/stable/rdf_terms.html#python-support)
+    language : str
+        Language of rdfLib Literal to use 
+
+    Returns
+    -------
+    rdflib.term.Identifier
+        rdflib.term.Identifier instance - either URIRef, Literal or BNode.
+
+    """
+
+    if not instance:
+        if language:
+            return Literal(value, lang = language)
+        elif datatype:
+            return Literal(value, datatype = URIRef(datatype))
+        elif re.match('^\w*:\w*$', str(value)) or re.match('^http[s]?://.*$', str(value)):
+            return URIRef(value)
+        else:
+            return Literal(value)
+    elif instance == Literal.__name__:
+        if language:
+            return Literal(value, lang = language)
+        elif datatype:
+            return Literal(value, datatype = URIRef(datatype))
+        else:
+            return Literal(value)
+    elif instance == URIRef.__name__:
+        return URIRef(value)
+    elif instance == BNode.__name__:
+        return BNode(value)
+
+    raise ValueError(f'Can only create Literal, URIRef or BNode but was {instance}')
+
+def _get_idl_for_identifier(i: Identifier) -> tuple:
+    """
+    Takes rdfLib Identifier, and returns a tuple of 
+    instance name (Literal, URIRef or BNode), datatype (XSD type) and language.
+
+    Parameters
+    ----------
+    i : rdflib.term.Identifier
+        rdfLib Identifier (parent of BNode, Literal or URIRef).
+
+    Returns
+    -------
+    tuple
+        tuple of instance, datatype and language.
+
+    """
+
+    instance = i.__class__.__name__
+    datatype = None
+    language = None
+    if isinstance(i, Literal):
+        datatype = i.datatype
+        language = i.language
+
+    return (instance, datatype, language)
+
+def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
+    """
+    Reusing NamespaceManager.normalizeUri for transforming Graph to DataFrame.
+    In effect we only need to strip < and > from N3 representation and
+    forget the case of URIRef being a rdflib.term.Variable.
+
+    Parameters
+    ----------
+    namespace_manager : rdflib.namespace.NamespaceManager
+        NamespaceManager to use to normalize URIs
+    uri : rdflib.URIRef
+        URI to normalize into its string form
+
+    Returns
+    -------
+    str
+        Normalised URI string.
+
+    """
+
+    return re.sub('<|>', '', namespace_manager.normalizeUri(uri))
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 nose
 sphinx
 sphinx_rtd_theme
-pandas>=0.22.0
-rdflib>=4.2.2
+pandas>=1.2.0
+rdflib>=5.0.0
diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@
 
 setup(
     name='rdfpandas',
-    version='0.1.1',
+    version='1.0.0',
     description='RDF support for Pandas',
     long_description=readme,
     author='Eugene Morozov',
diff --git a/tests/csv/test.csv b/tests/csv/test.csv
@@ -0,0 +1,3 @@
+@id,rdfpandas:curie{URIRef},rdfpandas:double{Literal}(xsd:double),rdfpandas:integer{Literal}(xsd:integer),rdfpandas:string{Literal},rdfpandas:string{Literal}[0]@en,rdfpandas:string{Literal}[1]@en,rdfpandas:string{Literal}[0]@ne,rdfpandas:string{Literal}[1]@ne,rdfpandas:string{Literal}@ru,rdfpandas:string{Literal}[0](xsd:string),rdfpandas:string{Literal}[1](xsd:string),rdfpandas:uri{URIRef},rdfpandas:anotherstring{Literal}
+rdfpandas:one,skos:broader,10.0,10,String 1,String in English 1 (1),String in English 2 (1),String in Nepali 1 (1),,String in Russian 1 (1),String with type 1 (1),String with type 2 (1),https://google.com,
+rdfpandas:two,,20.0,20,,String in English 1 (2),,String in Nepali 1 (2),String in Nepali 2 (2),,String with type 1 (2),String with type 2 (2),,String 2
diff --git a/tests/rdf/test.ttl b/tests/rdf/test.ttl
diff --git a/tests/test_data_frame_to_graph.py b/tests/test_data_frame_to_graph.py
diff --git a/tests/test_graph.py b/tests/test_graph.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-from .graph import to_graph`
	`1`	`+from .graph import to_graph, to_dataframe`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+@id,rdfpandas:curie{URIRef},rdfpandas:double{Literal}(xsd:double),rdfpandas:integer{Literal}(xsd:integer),rdfpandas:string{Literal},rdfpandas:string{Literal}[0]@en,rdfpandas:string{Literal}[1]@en,rdfpandas:string{Literal}[0]@ne,rdfpandas:string{Literal}[1]@ne,rdfpandas:string{Literal}@ru,rdfpandas:string{Literal}[0](xsd:string),rdfpandas:string{Literal}[1](xsd:string),rdfpandas:uri{URIRef},rdfpandas:anotherstring{Literal}`
	`2`	`+rdfpandas:one,skos:broader,10.0,10,String 1,String in English 1 (1),String in English 2 (1),String in Nepali 1 (1),,String in Russian 1 (1),String with type 1 (1),String with type 2 (1),https://google.com,`
	`3`	`+rdfpandas:two,,20.0,20,,String in English 1 (2),,String in Nepali 1 (2),String in Nepali 2 (2),,String with type 1 (2),String with type 2 (2),,String 2`