Skip to content

Commit ded56b1

Browse files
committed
Improves DataFrame to RDF conversion, adds support for RDF to DataFrame conversion, updates dependencies.
1 parent fca4e22 commit ded56b1

File tree

12 files changed

+646
-144
lines changed

12 files changed

+646
-144
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,7 @@ venv.bak/
9797

9898
# mypy
9999
.mypy_cache/
100+
101+
### VisualStudio Code ###
102+
103+
.vscode/

README.rst

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
11
RdfPandas
22
=========
33

4-
RdfPandas is a module providing RDF support for Pandas. It consists initially
5-
of a simple function for graph conversion to create RDFLib Graph data from
6-
Pandas DataFrame.
7-
8-
The graph data can then be serialized using RDFLib serialize method on the
9-
graph.
4+
RdfPandas is a module providing RDF support for Pandas. It consists of
5+
two simple functions for Graph to DataFrame conversion and
6+
DataFrame to Graph conversion.
107

118
Getting Started
129
---------------
@@ -31,7 +28,7 @@ Installation
3128
Usage
3229
-----
3330

34-
Getting RDF out of the DataFrame
31+
Creating RDF from DataFrame
3532
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3633

3734
::
@@ -40,6 +37,27 @@ Getting RDF out of the DataFrame
4037
import pandas as pd
4138
import rdflib
4239
43-
df = pd.DataFrame()
40+
df = pd.read_csv('to_graph_test.csv', index_col = '@id', keep_default_na = False)
4441
g = to_graph(df)
4542
s = g.serialize(format='turtle')
43+
44+
Creating DataFrame from RDF
45+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
46+
47+
::
48+
49+
import rdfpandas.graph
50+
import pandas as pd
51+
import rdflib
52+
53+
g = rdflib.Graph()
54+
g.parse('to_df_test.ttl', format = 'ttl')
55+
df = to_dataframe(g)
56+
df.to_csv('test.csv', index = True, index_label = "@id")
57+
58+
Gotchas
59+
-------
60+
61+
No special effort is made for dealing with types, so please be aware of Pandas
62+
features such as https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#support-for-integer-na
63+
that may result in surprising RDF statements like ``"10.0"^^<xsd:integer>``.

docs/source/conf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,13 @@
2020
# -- Project information -----------------------------------------------------
2121

2222
project = 'rdfpandas'
23-
copyright = '2018, Eugene Morozov'
23+
copyright = '2020, Eugene Morozov'
2424
author = 'Eugene Morozov'
2525

2626
# The short X.Y version
27-
version = 'v0.1.1'
27+
version = 'v1.0.0'
2828
# The full version, including alpha/beta/rc tags
29-
release = 'v0.1.1'
29+
release = 'v1.0.0'
3030

3131

3232
# -- General configuration ---------------------------------------------------

docs/source/index.rst

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,9 @@ RdfPandas
1515
Introduction
1616
============
1717

18-
RdfPandas is a module providing RDF support for Pandas. It consists initially
19-
of a simple function for graph conversion to create RDFLib Graph data from
20-
Pandas DataFrame.
21-
22-
The graph data can then be serialized using RDFLib serialize method on the
23-
graph.
18+
RdfPandas is a module providing RDF support for Pandas. It consists of
19+
two simple functions for Graph to DataFrame conversion and
20+
DataFrame to Graph conversion.
2421

2522
For more details and the source code see https://github.com/cadmiumkitty/rdfpandas/
2623

rdfpandas/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from .graph import to_graph
1+
from .graph import to_graph, to_dataframe

rdfpandas/graph.py

Lines changed: 224 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
# -*- coding: utf-8 -*-
22
import pandas as pd
3-
import rdflib
4-
import logging
3+
import numpy as np
4+
from rdflib import Graph, Literal, URIRef, BNode
5+
from rdflib.term import Identifier
6+
from rdflib.namespace import NamespaceManager
7+
import re
58

6-
7-
def to_graph(df: pd.DataFrame) -> rdflib.Graph:
9+
def to_graph(df: pd.DataFrame) -> Graph:
810
"""
9-
Takes Pandas DataFrame and returns RDFLib Graph using row indices as subjects
10-
and column indices as predictes. Object types are inferred from the Series
11-
content type.
11+
Takes Pandas DataFrame and returns RDFLib Graph.
12+
Row indices are used as subjects and column indices as predicates.
13+
Object types are inferred from the column index pattern of
14+
"predicate{rdfLib Identifier instance class name}(type)[index]@language".
15+
Index numbers simply create additional statements as opposed
16+
to attempting to construct a new rdfs:List or rdfs:Container.
17+
Namespaces need to be bound by the user of the method prior
18+
to serialization.
1219
1320
Parameters
1421
----------
@@ -18,21 +25,219 @@ def to_graph(df: pd.DataFrame) -> rdflib.Graph:
1825
Returns
1926
-------
2027
rdflib.Graph
21-
Graph created from Pandas DataFrame
28+
Graph created from Pandas DataFrame.
2229
2330
"""
2431

25-
g = rdflib.Graph()
26-
32+
g = Graph()
33+
2734
for (index, series) in df.iterrows():
2835
for (column, value) in series.iteritems():
29-
if (type(value) == 'bytes'):
30-
g.add((rdflib.URIRef(index),
31-
rdflib.URIRef(column),
32-
rdflib.Literal(value.decode('utf-8'))))
33-
else:
34-
g.add((rdflib.URIRef(index),
35-
rdflib.URIRef(column),
36-
rdflib.Literal(value)))
37-
36+
match = re.search('([\w?:/.]*)(\{(\w*)\})?(\[(\d*)\])?(\(([\w?:/.]*)\))?(@(\w*))?', column)
37+
38+
if pd.notna(value) and pd.notnull(value):
39+
s = _get_identifier(index)
40+
p = _get_identifier(match.group(1))
41+
if isinstance(value, bytes):
42+
o = _get_identifier(value.decode('utf-8'), match.group(3), match.group(7), match.group(9))
43+
else:
44+
o = _get_identifier(value, match.group(3), match.group(7), match.group(9))
45+
g.add((s, p, o))
46+
3847
return g
48+
49+
50+
def to_dataframe(g: Graph) -> pd.DataFrame:
51+
"""
52+
Takes rdfLib Graph object and creates Pandas DataFrame.
53+
Indices are subjects and attempt is made to construct CURIEs
54+
using namespace manager of the rdfLib Graph.
55+
Columns are predicates and attempt is made to construct CURIEs
56+
using namespace manager of the rdfLib Graph, similar to indices.
57+
Column names are created using
58+
"predicate{rdfLib Identifier instance class name}(type)[index]@language"
59+
pattern to allow for round trip conversion.
60+
Multiple objects for the same subject and predicate
61+
result in columns with index in its name.
62+
No attempts are made at type conversion; all objects are strings in the
63+
DataFrame.
64+
65+
Parameters
66+
----------
67+
g : rdflib.Graph
68+
rdfLib Graph.
69+
70+
Returns
71+
-------
72+
pd.DataFrame
73+
Pandas DataFrame created from rdfLib Graph.
74+
75+
"""
76+
77+
subjects = {}
78+
predicates = {}
79+
80+
for s in g.subjects():
81+
if s not in subjects:
82+
subjects[s] = s
83+
s_predicates = {}
84+
for p, o in sorted(g.predicate_objects(s)):
85+
idl = _get_idl_for_identifier(o)
86+
if p in s_predicates:
87+
if idl in s_predicates[p]:
88+
s_predicates[p][idl] = s_predicates[p][idl] + 1
89+
else:
90+
s_predicates[p][idl] = 1
91+
else:
92+
s_predicates[p] = {idl: 1}
93+
for p in s_predicates:
94+
if p in predicates:
95+
for idl in s_predicates[p]:
96+
if idl in predicates[p]:
97+
predicates[p][idl] = max(s_predicates[p][idl], predicates[p][idl])
98+
else:
99+
predicates[p][idl] = s_predicates[p][idl]
100+
else:
101+
predicates[p] = s_predicates[p]
102+
103+
series = {}
104+
105+
for p in predicates:
106+
idls_len = len(predicates[p])
107+
for idl in predicates[p]:
108+
instance = idl[0]
109+
datatype = idl[1]
110+
language = idl[2]
111+
idl_len = predicates[p][idl]
112+
for index in range(idl_len):
113+
series_name = f'{_get_str_for_uri(g.namespace_manager, p)}{{{instance}}}'
114+
if idl_len > 1:
115+
series_name = ''.join([series_name, f'[{index}]'])
116+
if datatype:
117+
series_name = ''.join([series_name, f'({_get_str_for_uri(g.namespace_manager, datatype)})'])
118+
if language:
119+
series_name = ''.join([series_name, f'@{language}'])
120+
p_subjects = []
121+
p_objects = []
122+
if idls_len == 1 and idl_len == 1:
123+
for s, o in sorted(g.subject_objects(p)):
124+
p_subjects.append(_get_str_for_uri(g.namespace_manager, s))
125+
if isinstance(o, Literal):
126+
p_objects.append(str(o))
127+
else:
128+
p_objects.append(_get_str_for_uri(g.namespace_manager, o))
129+
else:
130+
s_index = 0
131+
last_seen_subject = None
132+
for s, o in sorted(g.subject_objects(p)):
133+
if s != last_seen_subject:
134+
s_index = 0
135+
o_idl = _get_idl_for_identifier(o)
136+
if o_idl == idl:
137+
if s_index == index:
138+
p_subjects.append(_get_str_for_uri(g.namespace_manager, s))
139+
if isinstance(o, Literal):
140+
p_objects.append(str(o))
141+
else:
142+
p_objects.append(_get_str_for_uri(g.namespace_manager, o))
143+
s_index = s_index + 1
144+
last_seen_subject = s
145+
series[series_name] = pd.Series(data = p_objects, index = p_subjects, dtype = np.unicode_)
146+
147+
return pd.DataFrame(series)
148+
149+
def _get_identifier(value: object, instance: str = None, datatype: str = None, language: str = None) -> Identifier:
150+
"""
151+
Takes value extracted from the index, column or cell and returns
152+
an instance of Identifier (Literal, URIRef or BNode) using correct
153+
datatype and language.
154+
155+
Parameters
156+
----------
157+
value : object
158+
Value of index, column or cell
159+
instance : str
160+
Name of the rdfLib Identifier class to use
161+
datatype : str
162+
Datatype of rdfLib Literal to use
163+
(see https://rdflib.readthedocs.io/en/stable/rdf_terms.html#python-support)
164+
language : str
165+
Language of rdfLib Literal to use
166+
167+
Returns
168+
-------
169+
rdflib.term.Identifier
170+
rdflib.term.Identifier instance - a Literal, URIRef or BNode.
171+
172+
"""
173+
174+
if not instance:
175+
if language:
176+
return Literal(value, lang = language)
177+
elif datatype:
178+
return Literal(value, datatype = URIRef(datatype))
179+
elif re.match('^\w*:\w*$', str(value)) or re.match('^http[s]?://.*$', str(value)):
180+
return URIRef(value)
181+
else:
182+
return Literal(value)
183+
elif instance == Literal.__name__:
184+
if language:
185+
return Literal(value, lang = language)
186+
elif datatype:
187+
return Literal(value, datatype = URIRef(datatype))
188+
else:
189+
return Literal(value)
190+
elif instance == URIRef.__name__:
191+
return URIRef(value)
192+
elif instance == BNode.__name__:
193+
return BNode(value)
194+
195+
raise ValueError(f'Can only create Literal, URIRef or BNode but was {instance}')
196+
197+
def _get_idl_for_identifier(i: Identifier) -> tuple:
198+
"""
199+
Takes rdfLib Identifier, and returns a tuple of
200+
instance name (Literal, URIRef or BNode), datatype (XSD type) and language.
201+
202+
Parameters
203+
----------
204+
i : rdflib.term.Identifier
205+
rdfLib Identifier (parent of BNode, Literal or URIRef).
206+
207+
Returns
208+
-------
209+
tuple
210+
tuple of instance, datatype and language.
211+
212+
"""
213+
214+
instance = i.__class__.__name__
215+
datatype = None
216+
language = None
217+
if isinstance(i, Literal):
218+
datatype = i.datatype
219+
language = i.language
220+
221+
return (instance, datatype, language)
222+
223+
def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
224+
"""
225+
Reusing NamespaceManager.normalizeUri for transforming Graph to DataFrame.
226+
In effect we only need to strip < and > from N3 representation and
227+
forget the case of URIRef being a rdflib.term.Variable.
228+
229+
Parameters
230+
----------
231+
namespace_manager : rdflib.namespace.NamespaceManager
232+
NamespaceManager to use to normalize URIs
233+
uri : rdflib.URIRef
234+
URI to normalize using the given NamespaceManager
235+
236+
Returns
237+
-------
238+
str
239+
Normalised URI string.
240+
241+
"""
242+
243+
return re.sub('<|>', '', namespace_manager.normalizeUri(uri))

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
nose
22
sphinx
33
sphinx_rtd_theme
4-
pandas>=0.22.0
5-
rdflib>=4.2.2
4+
pandas>=1.2.0
5+
rdflib>=5.0.0

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
setup(
1313
name='rdfpandas',
14-
version='0.1.1',
14+
version='1.0.0',
1515
description='RDF support for Pandas',
1616
long_description=readme,
1717
author='Eugene Morozov',

tests/csv/test.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
@id,rdfpandas:curie{URIRef},rdfpandas:double{Literal}(xsd:double),rdfpandas:integer{Literal}(xsd:integer),rdfpandas:string{Literal},rdfpandas:string{Literal}[0]@en,rdfpandas:string{Literal}[1]@en,rdfpandas:string{Literal}[0]@ne,rdfpandas:string{Literal}[1]@ne,rdfpandas:string{Literal}@ru,rdfpandas:string{Literal}[0](xsd:string),rdfpandas:string{Literal}[1](xsd:string),rdfpandas:uri{URIRef},rdfpandas:anotherstring{Literal}
2+
rdfpandas:one,skos:broader,10.0,10,String 1,String in English 1 (1),String in English 2 (1),String in Nepali 1 (1),,String in Russian 1 (1),String with type 1 (1),String with type 2 (1),https://google.com,
3+
rdfpandas:two,,20.0,20,,String in English 1 (2),,String in Nepali 1 (2),String in Nepali 2 (2),,String with type 1 (2),String with type 2 (2),,String 2

0 commit comments

Comments
 (0)