Fixes #4 by adding backwards-compatible support for rdflib NamespaceManager.

cadmiumkitty · web-flow · commit f50e8ea3fc33 · 2021-01-10T18:34:40.000Z
diff --git a/README.rst b/README.rst
@@ -31,15 +31,20 @@ Usage
 Creating RDF from DataFrame
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+As of version 1.1.0, a ``NamespaceManager`` can be supplied to ``rdfpandas.to_graph`` for conversion to Graph.
+
 ::
 
   import rdfpandas.graph
   import pandas as pd
   import rdflib
  
   df = pd.read_csv('to_graph_test.csv', index_col = '@id', keep_default_na = False)
-  g = to_graph(df)
-  s = g.serialize(format='turtle')
+  namespace_manager = NamespaceManager(Graph())
+  namespace_manager.bind('skos', SKOS)
+  namespace_manager.bind('rdfpandas', Namespace('http://github.com/cadmiumkitty/rdfpandas/'))
+  g = to_graph(df, namespace_manager)
+  s = g.serialize(format = 'turtle')
 
 Creating DataFrame from RDF
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -20,13 +20,13 @@
 # -- Project information -----------------------------------------------------
 
 project = 'rdfpandas'
-copyright = '2020, Eugene Morozov'
+copyright = '2021, Eugene Morozov'
 author = 'Eugene Morozov'
 
 # The short X.Y version
-version = 'v1.0.0'
+version = 'v1.1.0'
 # The full version, including alpha/beta/rc tags
-release = 'v1.0.0'
+release = 'v1.1.0'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/rdfpandas/graph.py b/rdfpandas/graph.py
@@ -6,7 +6,7 @@
 from rdflib.namespace import NamespaceManager
 import re
 
-def to_graph(df: pd.DataFrame) -> Graph:
+def to_graph(df: pd.DataFrame, namespace_manager: NamespaceManager = None) -> Graph:
     """
     Takes Pandas DataFrame and returns RDFLib Graph.
     Row indices are used as subjects and column indices as predicates. 
@@ -21,6 +21,8 @@ def to_graph(df: pd.DataFrame) -> Graph:
     ----------
     df : pandas.DataFrame
         DataFrame to be converted into Graph.
+    namespace_manager : rdflib.namespace.NamespaceManager
+        NamespaceManager to use to normalize URIs
 
     Returns
     -------
@@ -29,19 +31,23 @@ def to_graph(df: pd.DataFrame) -> Graph:
 
     """
     
-    g = Graph()
+    g = Graph(namespace_manager = namespace_manager)
+
+    prefixes = {}
+    for (prefix, namespace) in g.namespace_manager.namespaces():
+        prefixes[prefix] = namespace
 
     for (index, series) in df.iterrows():
         for (column, value) in series.iteritems():
             match = re.search('([\w?:/.]*)(\{(\w*)\})?(\[(\d*)\])?(\(([\w?:/.]*)\))?(@(\w*))?', column)
 
             if pd.notna(value) and pd.notnull(value):
-                s = _get_identifier(index)
-                p = _get_identifier(match.group(1))
+                s = _get_identifier(prefixes, index)
+                p = _get_identifier(prefixes, match.group(1))
                 if isinstance(value, bytes):
-                    o = _get_identifier(value.decode('utf-8'), match.group(3), match.group(7), match.group(9))
+                    o = _get_identifier(prefixes, value.decode('utf-8'), match.group(3), match.group(7), match.group(9))
                 else:
-                    o = _get_identifier(value, match.group(3), match.group(7), match.group(9))
+                    o = _get_identifier(prefixes, value, match.group(3), match.group(7), match.group(9))
                 g.add((s, p, o))
 
     return g
@@ -110,22 +116,22 @@ def to_dataframe(g: Graph) -> pd.DataFrame:
             language = idl[2]
             idl_len = predicates[p][idl]
             for index in range(idl_len):
-                series_name = f'{_get_str_for_uri(g.namespace_manager, p)}{{{instance}}}'
+                series_name = f'{_get_str_for_uriref(g.namespace_manager, p)}{{{instance}}}'
                 if idl_len > 1:
                     series_name = ''.join([series_name, f'[{index}]'])
                 if datatype:
-                    series_name = ''.join([series_name, f'({_get_str_for_uri(g.namespace_manager, datatype)})'])
+                    series_name = ''.join([series_name, f'({_get_str_for_uriref(g.namespace_manager, datatype)})'])
                 if language:
                     series_name = ''.join([series_name, f'@{language}'])
                 p_subjects = []
                 p_objects = []
                 if idls_len == 1 and idl_len == 1:
                     for s, o in sorted(g.subject_objects(p)):
-                        p_subjects.append(_get_str_for_uri(g.namespace_manager, s))
+                        p_subjects.append(_get_str_for_uriref(g.namespace_manager, s))
                         if isinstance(o, Literal):
                             p_objects.append(str(o))
                         else:
-                            p_objects.append(_get_str_for_uri(g.namespace_manager, o))
+                            p_objects.append(_get_str_for_uriref(g.namespace_manager, o))
                 else:
                     s_index = 0
                     last_seen_subject = None
@@ -135,25 +141,27 @@ def to_dataframe(g: Graph) -> pd.DataFrame:
                         o_idl = _get_idl_for_identifier(o)
                         if o_idl == idl:
                             if s_index == index:
-                                p_subjects.append(_get_str_for_uri(g.namespace_manager, s))
+                                p_subjects.append(_get_str_for_uriref(g.namespace_manager, s))
                                 if isinstance(o, Literal):
                                     p_objects.append(str(o))
                                 else:
-                                    p_objects.append(_get_str_for_uri(g.namespace_manager, o))
+                                    p_objects.append(_get_str_for_uriref(g.namespace_manager, o))
                             s_index = s_index + 1
                         last_seen_subject = s
                 series[series_name] = pd.Series(data = p_objects, index = p_subjects, dtype = np.unicode_)
 
     return pd.DataFrame(series)
 
-def _get_identifier(value: object, instance: str = None, datatype: str = None, language: str = None) -> Identifier:
+def _get_identifier(prefixes: dict, value: object, instance: str = None, datatype: str = None, language: str = None) -> Identifier:
     """
     Takes value extracted from the index, column or cell and returns
     an instance of Identifier (Literal, URIRef or BNode) using correct 
     datatype and language.
 
     Parameters
     ----------
+    prefixes : dict
+        Prefixes to use to normalize URIs
     value : object
         Value of index, column or cell
     instance : str
@@ -176,19 +184,32 @@ def _get_identifier(value: object, instance: str = None, datatype: str = None, l
             return Literal(value, lang = language)
         elif datatype:
             return Literal(value, datatype = URIRef(datatype))
-        elif re.match('^\w*:\w*$', str(value)) or re.match('^http[s]?://.*$', str(value)):
+        elif _is_uri(value):
             return URIRef(value)
+        elif _is_curie(value):
+            return _get_uriref_for_curie(prefixes, value)
         else:
             return Literal(value)
     elif instance == Literal.__name__:
         if language:
             return Literal(value, lang = language)
         elif datatype:
-            return Literal(value, datatype = URIRef(datatype))
+            if _is_uri(datatype):
+                datatype_uriref = URIRef(datatype)
+            elif _is_curie(datatype):
+                datatype_uriref = _get_uriref_for_curie(prefixes, datatype)
+            else:
+                ValueError(f'Not a valid URI for datatype {datatype}')  
+            return Literal(value, datatype = datatype_uriref)
         else:
             return Literal(value)
     elif instance == URIRef.__name__:
-        return URIRef(value)
+        if _is_uri(value):
+            return URIRef(value)
+        elif _is_curie(value):
+            return _get_uriref_for_curie(prefixes, value)
+        else:
+            ValueError(f'Not a valid URI {value}')  
     elif instance == BNode.__name__:
         return BNode(value)
 
@@ -220,7 +241,7 @@ def _get_idl_for_identifier(i: Identifier) -> tuple:
 
     return (instance, datatype, language)
 
-def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
+def _get_str_for_uriref(namespace_manager: NamespaceManager, uriref: URIRef) -> str:
     """
     Reusing NamespaceManager.normalizeUri for transforming Graph to DataFrame.
     In effect we only need to strip < and > from N3 representation and
@@ -230,8 +251,8 @@ def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
     ----------
     namespace_manager : rdflib.namespace.NamespaceManager
         NamespaceManager to use to normalize URIs
-    uri : rdflib.URIRef
-        NamespaceManager to use to normalize URIs
+    uriref : rdflib.URIRef
+        URI to normalize
 
     Returns
     -------
@@ -240,4 +261,65 @@ def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
 
     """
 
-    return re.sub('<|>', '', namespace_manager.normalizeUri(uri))
+    return re.sub('<|>', '', namespace_manager.normalizeUri(uriref))
+
+def _get_uriref_for_curie(prefixes: dict, value: object) -> URIRef:
+    """
+    Converts curie string into URIRef with fully qualified URI.
+
+    Parameters
+    ----------
+    prefixes : dict
+        Prefixes to use to normalize URIs
+    value : object
+        Value from DataFrame to be converted to URIRef.
+
+    Returns
+    -------
+    rdflib.URIRef
+        URIRef created from the string.
+
+    """
+
+    prefix, name = value.split(':')
+    if prefix in prefixes:
+        return URIRef(''.join((prefixes[prefix], name)))
+    else:
+        return URIRef(value)
+
+def _is_curie(value: object) -> bool:
+    """
+    Checks if value from DataFrame is a CURIE.
+
+    Parameters
+    ----------
+    value : object
+        Value from DataFrame to be checked.
+
+    Returns
+    -------
+    bool
+        True if value is matching CURIE pattern, false otherwise.
+
+    """
+
+    return re.match('^\w*:\w*$', str(value))
+
+def _is_uri(value: object) -> bool:
+    """
+    Checks if value from DataFrame is a URI.
+
+    Parameters
+    ----------
+    value : object
+        Value from DataFrame to be checked.
+
+    Returns
+    -------
+    bool
+        True if value is matching URI pattern, false otherwise.
+
+    """
+
+    return re.match('^http[s]?://.*$', str(value))
+
diff --git a/setup.py b/setup.py
@@ -10,14 +10,15 @@
     license = f.read()
 
 setup(
-    name='rdfpandas',
-    version='1.0.0',
-    description='RDF support for Pandas',
-    long_description=readme,
-    author='Eugene Morozov',
-    author_email='emorozov@gmail.com',
-    url='https://github.com/cadmiumkitty/rdfpandas',
-    license='MIT',
-    packages=find_packages(exclude=('tests', 'docs'))
+    name = 'rdfpandas',
+    version = '1.1.0',
+    description = 'RDF support for Pandas',
+    long_description = readme,
+    author = 'Eugene Morozov',
+    author_email = 'emorozov@gmail.com',
+    url = 'https://github.com/cadmiumkitty/rdfpandas',
+    license = 'MIT',
+    packages = find_packages(exclude = ('tests', 'docs')),
+    install_requires = ['pandas>=1.2.0', 'rdflib>=5.0.0']
 )
 
diff --git a/tests/rdf/test.ttl b/tests/rdf/test.ttl
@@ -1,23 +1,24 @@
 @prefix rdfpandas: <http://github.com/cadmiumkitty/rdfpandas/> .
 @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
 
-<rdfpandas:one> <rdfpandas:curie> <skos:broader> ;
-    <rdfpandas:double> "10.0"^^<xsd:double> ;
-    <rdfpandas:integer> "10"^^<xsd:integer> ;
-    <rdfpandas:string> "String 1",
+rdfpandas:one rdfpandas:curie skos:broader ;
+    rdfpandas:double "10.0"^^xsd:double ;
+    rdfpandas:integer "10"^^xsd:integer ;
+    rdfpandas:string "String 1",
         "String in English 1 (1)"@en,
         "String in English 2 (1)"@en,
         "String in Nepali 1 (1)"@ne,
         "String in Russian 1 (1)"@ru,
-        "String with type 1 (1)"^^<xsd:string>,
-        "String with type 2 (1)"^^<xsd:string> ;
-    <rdfpandas:uri> <https://google.com> .
+        "String with type 1 (1)"^^xsd:string,
+        "String with type 2 (1)"^^xsd:string ;
+    rdfpandas:uri <https://google.com> .
 
-<rdfpandas:two> <rdfpandas:anotherstring> "String 2" ;
-    <rdfpandas:double> "20.0"^^<xsd:double> ;
-    <rdfpandas:integer> "20"^^<xsd:integer> ;
-    <rdfpandas:string> "String in English 1 (2)"@en,
+rdfpandas:two rdfpandas:anotherstring "String 2" ;
+    rdfpandas:double "20.0"^^xsd:double ;
+    rdfpandas:integer "20"^^xsd:integer ;
+    rdfpandas:string "String in English 1 (2)"@en,
         "String in Nepali 1 (2)"@ne,
         "String in Nepali 2 (2)"@ne,
-        "String with type 1 (2)"^^<xsd:string>,
-        "String with type 2 (2)"^^<xsd:string> .
+        "String with type 1 (2)"^^xsd:string,
+        "String with type 2 (2)"^^xsd:string .
diff --git a/tests/test_graph.py b/tests/test_graph.py
@@ -5,9 +5,9 @@
 import pandas as pd
 import numpy as np
 
-from rdflib import Graph, Literal, URIRef, BNode
+from rdflib import Graph, Literal, URIRef, BNode, Namespace
 from rdflib.term import Identifier
-from rdflib.namespace import NamespaceManager
+from rdflib.namespace import NamespaceManager, SKOS, XSD
 import rdflib.compare
 
 import unittest
@@ -156,7 +156,7 @@ def test_should_convert_data_frame_to_graph_literal(self):
                         Literal('String')))
         g_expected.add((URIRef('http://github.com/cadmiumkitty/rdfpandas/one'),
                         URIRef('http://github.com/cadmiumkitty/rdfpandas/string'),
-                        Literal('String with type only', datatype = URIRef('xsd:string'))))
+                        Literal('String with type only', datatype = XSD.string)))
         g_expected.add((URIRef('http://github.com/cadmiumkitty/rdfpandas/one'),
                         URIRef('http://github.com/cadmiumkitty/rdfpandas/string'),
                         Literal('String with language only in Nepali', lang = 'ne')))
@@ -334,7 +334,10 @@ def test_should_roundtrip_csv_to_graph_to_csv(self):
         """
 
         df = pd.read_csv('./csv/test.csv', index_col = '@id', keep_default_na = True)
-        g = rdfpandas.to_graph(df)
+        namespace_manager = NamespaceManager(Graph())
+        namespace_manager.bind('skos', SKOS)
+        namespace_manager.bind('rdfpandas', Namespace('http://github.com/cadmiumkitty/rdfpandas/'))
+        g = rdfpandas.to_graph(df, namespace_manager)
         df_result = rdfpandas.to_dataframe(g)
 
         pd.testing.assert_frame_equal(df.astype(np.unicode_), df_result.astype(np.unicode_), check_like = True, check_names = False)
@@ -347,8 +350,8 @@ def test_should_roundtrip_graph_to_csv_to_graph(self):
         g = rdflib.Graph()
         g.parse('./rdf/test.ttl', format = 'ttl')
         df = rdfpandas.to_dataframe(g)
-        g_result = rdfpandas.to_graph(df)
-
+        print(df.T)
+        g_result = rdfpandas.to_graph(df, g.namespace_manager)
         self.assertEquals(rdflib.compare.isomorphic(g, g_result), True)