Skip to content

Commit f50e8ea

Browse files
authored
Fixes #4 by adding backwards-compatible support for rdflib NamespaceManager.
1 parent ded56b1 commit f50e8ea

File tree

6 files changed

+145
-53
lines changed

6 files changed

+145
-53
lines changed

README.rst

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,20 @@ Usage
3131
Creating RDF from DataFrame
3232
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3333

34+
As of version 1.1.0, a NamespaceManager can be supplied to ``rdfpandas.to_graph`` for conversion to a Graph.
35+
3436
::
3537

3638
import rdfpandas.graph
3739
import pandas as pd
3840
import rdflib
3941
4042
df = pd.read_csv('to_graph_test.csv', index_col = '@id', keep_default_na = False)
41-
g = to_graph(df)
42-
s = g.serialize(format='turtle')
43+
namespace_manager = NamespaceManager(Graph())
44+
namespace_manager.bind('skos', SKOS)
45+
namespace_manager.bind('rdfpandas', Namespace('http://github.com/cadmiumkitty/rdfpandas/'))
46+
g = to_graph(df, namespace_manager)
47+
s = g.serialize(format = 'turtle')
4348

4449
Creating DataFrame from RDF
4550
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

docs/source/conf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,13 @@
2020
# -- Project information -----------------------------------------------------
2121

2222
project = 'rdfpandas'
23-
copyright = '2020, Eugene Morozov'
23+
copyright = '2021, Eugene Morozov'
2424
author = 'Eugene Morozov'
2525

2626
# The short X.Y version
27-
version = 'v1.0.0'
27+
version = 'v1.1.0'
2828
# The full version, including alpha/beta/rc tags
29-
release = 'v1.0.0'
29+
release = 'v1.1.0'
3030

3131

3232
# -- General configuration ---------------------------------------------------

rdfpandas/graph.py

Lines changed: 102 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from rdflib.namespace import NamespaceManager
77
import re
88

9-
def to_graph(df: pd.DataFrame) -> Graph:
9+
def to_graph(df: pd.DataFrame, namespace_manager: NamespaceManager = None) -> Graph:
1010
"""
1111
Takes Pandas DataFrame and returns RDFLib Graph.
1212
Row indices are used as subjects and column indices as predicates.
@@ -21,6 +21,8 @@ def to_graph(df: pd.DataFrame) -> Graph:
2121
----------
2222
df : pandas.DataFrame
2323
DataFrame to be converted into Graph.
24+
namespace_manager : rdflib.namespace.NamespaceManager
25+
NamespaceManager to use to normalize URIs
2426
2527
Returns
2628
-------
@@ -29,19 +31,23 @@ def to_graph(df: pd.DataFrame) -> Graph:
2931
3032
"""
3133

32-
g = Graph()
34+
g = Graph(namespace_manager = namespace_manager)
35+
36+
prefixes = {}
37+
for (prefix, namespace) in g.namespace_manager.namespaces():
38+
prefixes[prefix] = namespace
3339

3440
for (index, series) in df.iterrows():
3541
for (column, value) in series.iteritems():
3642
match = re.search('([\w?:/.]*)(\{(\w*)\})?(\[(\d*)\])?(\(([\w?:/.]*)\))?(@(\w*))?', column)
3743

3844
if pd.notna(value) and pd.notnull(value):
39-
s = _get_identifier(index)
40-
p = _get_identifier(match.group(1))
45+
s = _get_identifier(prefixes, index)
46+
p = _get_identifier(prefixes, match.group(1))
4147
if isinstance(value, bytes):
42-
o = _get_identifier(value.decode('utf-8'), match.group(3), match.group(7), match.group(9))
48+
o = _get_identifier(prefixes, value.decode('utf-8'), match.group(3), match.group(7), match.group(9))
4349
else:
44-
o = _get_identifier(value, match.group(3), match.group(7), match.group(9))
50+
o = _get_identifier(prefixes, value, match.group(3), match.group(7), match.group(9))
4551
g.add((s, p, o))
4652

4753
return g
@@ -110,22 +116,22 @@ def to_dataframe(g: Graph) -> pd.DataFrame:
110116
language = idl[2]
111117
idl_len = predicates[p][idl]
112118
for index in range(idl_len):
113-
series_name = f'{_get_str_for_uri(g.namespace_manager, p)}{{{instance}}}'
119+
series_name = f'{_get_str_for_uriref(g.namespace_manager, p)}{{{instance}}}'
114120
if idl_len > 1:
115121
series_name = ''.join([series_name, f'[{index}]'])
116122
if datatype:
117-
series_name = ''.join([series_name, f'({_get_str_for_uri(g.namespace_manager, datatype)})'])
123+
series_name = ''.join([series_name, f'({_get_str_for_uriref(g.namespace_manager, datatype)})'])
118124
if language:
119125
series_name = ''.join([series_name, f'@{language}'])
120126
p_subjects = []
121127
p_objects = []
122128
if idls_len == 1 and idl_len == 1:
123129
for s, o in sorted(g.subject_objects(p)):
124-
p_subjects.append(_get_str_for_uri(g.namespace_manager, s))
130+
p_subjects.append(_get_str_for_uriref(g.namespace_manager, s))
125131
if isinstance(o, Literal):
126132
p_objects.append(str(o))
127133
else:
128-
p_objects.append(_get_str_for_uri(g.namespace_manager, o))
134+
p_objects.append(_get_str_for_uriref(g.namespace_manager, o))
129135
else:
130136
s_index = 0
131137
last_seen_subject = None
@@ -135,25 +141,27 @@ def to_dataframe(g: Graph) -> pd.DataFrame:
135141
o_idl = _get_idl_for_identifier(o)
136142
if o_idl == idl:
137143
if s_index == index:
138-
p_subjects.append(_get_str_for_uri(g.namespace_manager, s))
144+
p_subjects.append(_get_str_for_uriref(g.namespace_manager, s))
139145
if isinstance(o, Literal):
140146
p_objects.append(str(o))
141147
else:
142-
p_objects.append(_get_str_for_uri(g.namespace_manager, o))
148+
p_objects.append(_get_str_for_uriref(g.namespace_manager, o))
143149
s_index = s_index + 1
144150
last_seen_subject = s
145151
series[series_name] = pd.Series(data = p_objects, index = p_subjects, dtype = np.unicode_)
146152

147153
return pd.DataFrame(series)
148154

149-
def _get_identifier(value: object, instance: str = None, datatype: str = None, language: str = None) -> Identifier:
155+
def _get_identifier(prefixes: dict, value: object, instance: str = None, datatype: str = None, language: str = None) -> Identifier:
150156
"""
151157
Takes value extracted from the index, column or cell and returns
152158
an instance of Identifier (Literal, URIRef or BNode) using correct
153159
datatype and language.
154160
155161
Parameters
156162
----------
163+
prefixes : dict
164+
Prefixes to use to normalize URIs
157165
value : object
158166
Value of index, column or cell
159167
instance : str
@@ -176,19 +184,32 @@ def _get_identifier(value: object, instance: str = None, datatype: str = None, l
176184
return Literal(value, lang = language)
177185
elif datatype:
178186
return Literal(value, datatype = URIRef(datatype))
179-
elif re.match('^\w*:\w*$', str(value)) or re.match('^http[s]?://.*$', str(value)):
187+
elif _is_uri(value):
180188
return URIRef(value)
189+
elif _is_curie(value):
190+
return _get_uriref_for_curie(prefixes, value)
181191
else:
182192
return Literal(value)
183193
elif instance == Literal.__name__:
184194
if language:
185195
return Literal(value, lang = language)
186196
elif datatype:
187-
return Literal(value, datatype = URIRef(datatype))
197+
if _is_uri(datatype):
198+
datatype_uriref = URIRef(datatype)
199+
elif _is_curie(datatype):
200+
datatype_uriref = _get_uriref_for_curie(prefixes, datatype)
201+
else:
202+
ValueError(f'Not a valid URI for datatype {datatype}')
203+
return Literal(value, datatype = datatype_uriref)
188204
else:
189205
return Literal(value)
190206
elif instance == URIRef.__name__:
191-
return URIRef(value)
207+
if _is_uri(value):
208+
return URIRef(value)
209+
elif _is_curie(value):
210+
return _get_uriref_for_curie(prefixes, value)
211+
else:
212+
ValueError(f'Not a valid URI {value}')
192213
elif instance == BNode.__name__:
193214
return BNode(value)
194215

@@ -220,7 +241,7 @@ def _get_idl_for_identifier(i: Identifier) -> tuple:
220241

221242
return (instance, datatype, language)
222243

223-
def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
244+
def _get_str_for_uriref(namespace_manager: NamespaceManager, uriref: URIRef) -> str:
224245
"""
225246
Reusing NamespaceManager.normalizeUri for transforming Graph to DataFrame.
226247
In effect we only need to strip < and > from N3 representation and
@@ -230,8 +251,8 @@ def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
230251
----------
231252
namespace_manager : rdflib.namespace.NamespaceManager
232253
NamespaceManager to use to normalize URIs
233-
uri : rdflib.URIRef
234-
NamespaceManager to use to normalize URIs
254+
uriref : rdflib.URIRef
255+
URI to normalize
235256
236257
Returns
237258
-------
@@ -240,4 +261,65 @@ def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
240261
241262
"""
242263

243-
return re.sub('<|>', '', namespace_manager.normalizeUri(uri))
264+
return re.sub('<|>', '', namespace_manager.normalizeUri(uriref))
265+
266+
def _get_uriref_for_curie(prefixes: dict, value: object) -> URIRef:
    """
    Converts curie string into URIRef with fully qualified URI.

    The part of the value before the first ``:`` is looked up in
    ``prefixes``. If the prefix is known, the matching namespace is
    prepended to the remainder of the value. If it is not known, the value
    is wrapped into URIRef unchanged.

    Parameters
    ----------
    prefixes : dict
        Prefixes to use to normalize URIs (prefix string mapped to
        namespace URI).
    value : object
        Value from DataFrame to be converted to URIRef.

    Returns
    -------
    rdflib.URIRef
        URIRef created from the string.

    """

    # Coerce through str() the same way _is_curie does, so that a value which
    # passed the CURIE check can always be split here as well.
    text = str(value)

    # partition() splits on the first ':' only and never fails on unpacking,
    # unlike a two-target unpack of split(':').
    prefix, _, name = text.partition(':')

    if prefix in prefixes:
        return URIRef(''.join((prefixes[prefix], name)))

    # Unknown prefix: keep the value as-is rather than guessing a namespace.
    return URIRef(text)
289+
290+
def _is_curie(value: object) -> bool:
291+
"""
292+
Checks if value from DataFrame is a CURIE.
293+
294+
Parameters
295+
----------
296+
value : object
297+
Value from DataFrame to be checked.
298+
299+
Returns
300+
-------
301+
bool
302+
True if value is matching CURIE pattern, false otherwise.
303+
304+
"""
305+
306+
return re.match('^\w*:\w*$', str(value))
307+
308+
def _is_uri(value: object) -> bool:
309+
"""
310+
Checks if value from DataFrame is a URI.
311+
312+
Parameters
313+
----------
314+
value : object
315+
Value from DataFrame to be checked.
316+
317+
Returns
318+
-------
319+
bool
320+
True if value is matching URI pattern, false otherwise.
321+
322+
"""
323+
324+
return re.match('^http[s]?://.*$', str(value))
325+

setup.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,15 @@
1010
license = f.read()
1111

1212
setup(
13-
name='rdfpandas',
14-
version='1.0.0',
15-
description='RDF support for Pandas',
16-
long_description=readme,
17-
author='Eugene Morozov',
18-
author_email='emorozov@gmail.com',
19-
url='https://github.com/cadmiumkitty/rdfpandas',
20-
license='MIT',
21-
packages=find_packages(exclude=('tests', 'docs'))
13+
name = 'rdfpandas',
14+
version = '1.1.0',
15+
description = 'RDF support for Pandas',
16+
long_description = readme,
17+
author = 'Eugene Morozov',
18+
author_email = 'emorozov@gmail.com',
19+
url = 'https://github.com/cadmiumkitty/rdfpandas',
20+
license = 'MIT',
21+
packages = find_packages(exclude = ('tests', 'docs')),
22+
install_requires = ['pandas>=1.2.0', 'rdflib>=5.0.0']
2223
)
2324

tests/rdf/test.ttl

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,24 @@
11
@prefix rdfpandas: <http://github.com/cadmiumkitty/rdfpandas/> .
22
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
3+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
34

4-
<rdfpandas:one> <rdfpandas:curie> <skos:broader> ;
5-
<rdfpandas:double> "10.0"^^<xsd:double> ;
6-
<rdfpandas:integer> "10"^^<xsd:integer> ;
7-
<rdfpandas:string> "String 1",
5+
rdfpandas:one rdfpandas:curie skos:broader ;
6+
rdfpandas:double "10.0"^^xsd:double ;
7+
rdfpandas:integer "10"^^xsd:integer ;
8+
rdfpandas:string "String 1",
89
"String in English 1 (1)"@en,
910
"String in English 2 (1)"@en,
1011
"String in Nepali 1 (1)"@ne,
1112
"String in Russian 1 (1)"@ru,
12-
"String with type 1 (1)"^^<xsd:string>,
13-
"String with type 2 (1)"^^<xsd:string> ;
14-
<rdfpandas:uri> <https://google.com> .
13+
"String with type 1 (1)"^^xsd:string,
14+
"String with type 2 (1)"^^xsd:string ;
15+
rdfpandas:uri <https://google.com> .
1516

16-
<rdfpandas:two> <rdfpandas:anotherstring> "String 2" ;
17-
<rdfpandas:double> "20.0"^^<xsd:double> ;
18-
<rdfpandas:integer> "20"^^<xsd:integer> ;
19-
<rdfpandas:string> "String in English 1 (2)"@en,
17+
rdfpandas:two rdfpandas:anotherstring "String 2" ;
18+
rdfpandas:double "20.0"^^xsd:double ;
19+
rdfpandas:integer "20"^^xsd:integer ;
20+
rdfpandas:string "String in English 1 (2)"@en,
2021
"String in Nepali 1 (2)"@ne,
2122
"String in Nepali 2 (2)"@ne,
22-
"String with type 1 (2)"^^<xsd:string>,
23-
"String with type 2 (2)"^^<xsd:string> .
23+
"String with type 1 (2)"^^xsd:string,
24+
"String with type 2 (2)"^^xsd:string .

tests/test_graph.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
import pandas as pd
66
import numpy as np
77

8-
from rdflib import Graph, Literal, URIRef, BNode
8+
from rdflib import Graph, Literal, URIRef, BNode, Namespace
99
from rdflib.term import Identifier
10-
from rdflib.namespace import NamespaceManager
10+
from rdflib.namespace import NamespaceManager, SKOS, XSD
1111
import rdflib.compare
1212

1313
import unittest
@@ -156,7 +156,7 @@ def test_should_convert_data_frame_to_graph_literal(self):
156156
Literal('String')))
157157
g_expected.add((URIRef('http://github.com/cadmiumkitty/rdfpandas/one'),
158158
URIRef('http://github.com/cadmiumkitty/rdfpandas/string'),
159-
Literal('String with type only', datatype = URIRef('xsd:string'))))
159+
Literal('String with type only', datatype = XSD.string)))
160160
g_expected.add((URIRef('http://github.com/cadmiumkitty/rdfpandas/one'),
161161
URIRef('http://github.com/cadmiumkitty/rdfpandas/string'),
162162
Literal('String with language only in Nepali', lang = 'ne')))
@@ -334,7 +334,10 @@ def test_should_roundtrip_csv_to_graph_to_csv(self):
334334
"""
335335

336336
df = pd.read_csv('./csv/test.csv', index_col = '@id', keep_default_na = True)
337-
g = rdfpandas.to_graph(df)
337+
namespace_manager = NamespaceManager(Graph())
338+
namespace_manager.bind('skos', SKOS)
339+
namespace_manager.bind('rdfpandas', Namespace('http://github.com/cadmiumkitty/rdfpandas/'))
340+
g = rdfpandas.to_graph(df, namespace_manager)
338341
df_result = rdfpandas.to_dataframe(g)
339342

340343
pd.testing.assert_frame_equal(df.astype(np.unicode_), df_result.astype(np.unicode_), check_like = True, check_names = False)
@@ -347,8 +350,8 @@ def test_should_roundtrip_graph_to_csv_to_graph(self):
347350
g = rdflib.Graph()
348351
g.parse('./rdf/test.ttl', format = 'ttl')
349352
df = rdfpandas.to_dataframe(g)
350-
g_result = rdfpandas.to_graph(df)
351-
353+
print(df.T)
354+
g_result = rdfpandas.to_graph(df, g.namespace_manager)
352355
self.assertEquals(rdflib.compare.isomorphic(g, g_result), True)
353356

354357

0 commit comments

Comments
 (0)