11# -*- coding: utf-8 -*-
22import pandas as pd
3- import rdflib
4- import logging
3+ import numpy as np
4+ from rdflib import Graph , Literal , URIRef , BNode
5+ from rdflib .term import Identifier
6+ from rdflib .namespace import NamespaceManager
7+ import re
58
6-
def to_graph(df: pd.DataFrame) -> Graph:
    """
    Takes Pandas DataFrame and returns RDFLib Graph.
    Row indices are used as subjects and column indices as predicates.
    Object types are inferred from the column index pattern of
    "predicate{rdfLib Identifier instance class name}[index](type)@language".
    Every part after the predicate is optional, but the parts must appear in
    this order to be recognised.
    Index numbers simply create additional statements as opposed
    to attempting to construct a new rdfs:List or rdfs:Container.
    Namespaces need to be bound by the user of the method prior
    to serialization.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to be converted into Graph.

    Returns
    -------
    rdflib.Graph
        Graph created from Pandas DataFrame.

    """

    g = Graph()

    # Groups: 1 = predicate, 3 = Identifier class name, 5 = index (unused),
    # 7 = datatype, 9 = language.
    # NOTE(review): the predicate/datatype character class is [\w?:/.], so a
    # label containing e.g. '#' or '-' is only matched up to that character -
    # confirm whether full URIs with fragments need to be supported.
    column_pattern = re.compile(
        r'([\w?:/.]*)(\{(\w*)\})?(\[(\d*)\])?(\(([\w?:/.]*)\))?(@(\w*))?')

    # A column label parses the same way for every row, so parse each
    # distinct label only once.
    parsed_columns = {}

    for (index, series) in df.iterrows():
        # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
        for (column, value) in series.items():
            if column not in parsed_columns:
                match = column_pattern.search(column)
                parsed_columns[column] = (match.group(1), match.group(3),
                                          match.group(7), match.group(9))
            predicate, instance, datatype, language = parsed_columns[column]

            # Missing cells (NaN / None / NaT) produce no statement.
            if pd.notna(value):
                if isinstance(value, bytes):
                    value = value.decode('utf-8')
                s = _get_identifier(index)
                p = _get_identifier(predicate)
                o = _get_identifier(value, instance, datatype, language)
                g.add((s, p, o))

    return g
48+
49+
def to_dataframe(g: Graph) -> pd.DataFrame:
    """
    Takes rdfLib Graph object and creates Pandas DataFrame.
    Indices are subjects and attempt is made to construct CURIEs
    using namespace manager of the rdfLib Graph.
    Columns are predicates and attempt is made to construct CURIEs
    using namespace manager of the rdfLib Graph, similar to indices.
    Column names are created using
    "predicate{rdfLib Identifier instance class name}[index](type)@language"
    pattern to allow for round trip conversion.
    Multiple objects for the same subject and predicate
    result in columns with index in its name.
    No attempts are made at type conversion, all objects are strings in the
    DataFrame.

    Parameters
    ----------
    g : rdflib.Graph
        rdfLib Graph.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame created from rdfLib Graph.

    """

    # Pass 1: for every predicate and every (instance, datatype, language)
    # combination record the largest number of such objects that any single
    # subject carries; this decides how many indexed columns are needed.
    predicates = {}
    seen_subjects = set()

    for s in g.subjects():
        # The same subject may be yielded more than once; count it only once.
        if s in seen_subjects:
            continue
        seen_subjects.add(s)

        s_predicates = {}
        for p, o in sorted(g.predicate_objects(s)):
            idl = _get_idl_for_identifier(o)
            counts = s_predicates.setdefault(p, {})
            counts[idl] = counts.get(idl, 0) + 1

        for p, counts in s_predicates.items():
            if p not in predicates:
                predicates[p] = counts
                continue
            known = predicates[p]
            for idl, count in counts.items():
                known[idl] = max(count, known.get(idl, 0))

    # Pass 2: build one Series per (predicate, idl, index) combination.
    namespace_manager = g.namespace_manager
    series = {}

    def _object_to_str(o):
        # Literals keep their lexical value, everything else is shortened
        # through the namespace manager.
        if isinstance(o, Literal):
            return str(o)
        return _get_str_for_uri(namespace_manager, o)

    for p, idls in predicates.items():
        predicate_name = _get_str_for_uri(namespace_manager, p)
        # Sorted once per predicate; the sort groups pairs by subject, which
        # the per-subject counter below relies on.
        subject_objects = sorted(g.subject_objects(p))

        for idl, idl_len in idls.items():
            instance, datatype, language = idl

            for index in range(idl_len):
                series_name = f'{predicate_name}{{{instance}}}'
                if idl_len > 1:
                    series_name = ''.join([series_name, f'[{index}]'])
                if datatype:
                    series_name = ''.join(
                        [series_name,
                         f'({_get_str_for_uri(namespace_manager, datatype)})'])
                if language:
                    series_name = ''.join([series_name, f'@{language}'])

                p_subjects = []
                p_objects = []
                # s_index counts, per subject, the objects of this idl seen
                # so far; the object whose position equals `index` goes into
                # this column.
                s_index = 0
                last_seen_subject = None
                for s, o in subject_objects:
                    if s != last_seen_subject:
                        s_index = 0
                        last_seen_subject = s
                    if _get_idl_for_identifier(o) != idl:
                        continue
                    if s_index == index:
                        p_subjects.append(
                            _get_str_for_uri(namespace_manager, s))
                        p_objects.append(_object_to_str(o))
                    s_index = s_index + 1

                # `str` replaces np.unicode_, which was removed in NumPy 2.0.
                series[series_name] = pd.Series(data=p_objects,
                                                index=p_subjects,
                                                dtype=str)

    return pd.DataFrame(series)
148+
def _get_identifier(value: object, instance: str = None, datatype: str = None, language: str = None) -> Identifier:
    """
    Takes value extracted from the index, column or cell and returns
    an instance of Identifier (Literal, URIRef or BNode) using correct
    datatype and language.

    Parameters
    ----------
    value : object
        Value of index, column or cell
    instance : str
        Name of the rdfLib Identifier class to use. When empty or None the
        class is inferred from language, datatype and the value itself.
    datatype : str
        Datatype of rdfLib Literal to use
        (see https://rdflib.readthedocs.io/en/stable/rdf_terms.html#python-support)
    language : str
        Language of rdfLib Literal to use. Takes precedence over datatype.

    Returns
    -------
    rdflib.term.Identifier
        rdflib.term.Identifier instance - Literal, URIRef or BNode.

    Raises
    ------
    ValueError
        If instance is given but is not 'Literal', 'URIRef' or 'BNode'.

    """

    if instance:
        if instance == URIRef.__name__:
            return URIRef(value)
        if instance == BNode.__name__:
            return BNode(value)
        if instance != Literal.__name__:
            raise ValueError(f'Can only create Literal, URIRef or BNode but was {instance}')

    # From here on either a Literal was requested explicitly or the class
    # has to be inferred; in both cases language wins over datatype.
    if language:
        return Literal(value, lang=language)
    if datatype:
        return Literal(value, datatype=URIRef(datatype))

    if not instance:
        # Without any hint, a value that looks like a CURIE (prefix:name) or
        # an http(s) URL is treated as a URI reference.
        text = str(value)
        if re.match(r'^\w*:\w*$', text) or re.match(r'^http[s]?://.*$', text):
            return URIRef(value)

    return Literal(value)
196+
def _get_idl_for_identifier(i: Identifier) -> tuple:
    """
    Describes an rdfLib Identifier as an (instance, datatype, language)
    tuple, where instance is the class name of the identifier.

    Parameters
    ----------
    i : rdflib.term.Identifier
        rdfLib Identifier to describe.

    Returns
    -------
    tuple
        (class name, datatype, language); datatype and language are
        taken from the identifier when it is a Literal and are None
        otherwise.

    """

    class_name = i.__class__.__name__

    # Only Literals are asked for a datatype and a language.
    if not isinstance(i, Literal):
        return (class_name, None, None)

    return (class_name, i.datatype, i.language)
222+
def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
    """
    Produces the string used for a term in DataFrame indices, column names
    and cells: the result of NamespaceManager.normalizeUri with every
    '<' and '>' character removed.

    Parameters
    ----------
    namespace_manager : rdflib.namespace.NamespaceManager
        NamespaceManager to use to normalize URIs
    uri : rdflib.URIRef
        Term to normalize.

    Returns
    -------
    str
        Normalised URI string.

    """

    normalized = namespace_manager.normalizeUri(uri)
    # Drop the angle brackets of the N3 form wherever they occur.
    without_opening = normalized.replace('<', '')
    return without_opening.replace('>', '')
0 commit comments