66from rdflib .namespace import NamespaceManager
77import re
88
def to_graph(df: pd.DataFrame, namespace_manager: NamespaceManager = None) -> Graph:
    """
    Takes Pandas DataFrame and returns RDFLib Graph.
    Row indices are used as subjects and column indices as predicates.
    Column labels are parsed as ``predicate{instance}[index](datatype)@language``,
    where every part after the predicate is optional.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to be converted into Graph.
    namespace_manager : rdflib.namespace.NamespaceManager
        NamespaceManager to use to normalize URIs. It is forwarded to the
        Graph constructor and its prefix bindings are handed to
        _get_identifier for every subject, predicate and object.

    Returns
    -------
    rdflib.Graph
        Graph with one triple added per non-null cell of the DataFrame.

    """

    g = Graph(namespace_manager=namespace_manager)

    # Snapshot the prefix -> namespace bindings once; they are passed to
    # _get_identifier on every call below.
    prefixes = dict(g.namespace_manager.namespaces())

    # Groups used below: 1 = predicate, 3 = instance, 7 = datatype, 9 = language.
    # Group 5 (the [index] part) is matched but not used here.
    column_pattern = re.compile(
        r'([\w?:/.]*)(\{(\w*)\})?(\[(\d*)\])?(\(([\w?:/.]*)\))?(@(\w*))?'
    )

    for index, series in df.iterrows():
        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
        for column, value in series.items():
            match = column_pattern.search(column)

            # pd.notnull is an alias of pd.notna, so a single check is enough.
            if pd.notna(value):
                if isinstance(value, bytes):
                    value = value.decode('utf-8')
                s = _get_identifier(prefixes, index)
                p = _get_identifier(prefixes, match.group(1))
                o = _get_identifier(prefixes, value, match.group(3), match.group(7), match.group(9))
                g.add((s, p, o))

    return g
@@ -110,22 +116,22 @@ def to_dataframe(g: Graph) -> pd.DataFrame:
110116 language = idl [2 ]
111117 idl_len = predicates [p ][idl ]
112118 for index in range (idl_len ):
113- series_name = f'{ _get_str_for_uri (g .namespace_manager , p )} {{{ instance } }}'
119+ series_name = f'{ _get_str_for_uriref (g .namespace_manager , p )} {{{ instance } }}'
114120 if idl_len > 1 :
115121 series_name = '' .join ([series_name , f'[{ index } ]' ])
116122 if datatype :
117- series_name = '' .join ([series_name , f'({ _get_str_for_uri (g .namespace_manager , datatype )} )' ])
123+ series_name = '' .join ([series_name , f'({ _get_str_for_uriref (g .namespace_manager , datatype )} )' ])
118124 if language :
119125 series_name = '' .join ([series_name , f'@{ language } ' ])
120126 p_subjects = []
121127 p_objects = []
122128 if idls_len == 1 and idl_len == 1 :
123129 for s , o in sorted (g .subject_objects (p )):
124- p_subjects .append (_get_str_for_uri (g .namespace_manager , s ))
130+ p_subjects .append (_get_str_for_uriref (g .namespace_manager , s ))
125131 if isinstance (o , Literal ):
126132 p_objects .append (str (o ))
127133 else :
128- p_objects .append (_get_str_for_uri (g .namespace_manager , o ))
134+ p_objects .append (_get_str_for_uriref (g .namespace_manager , o ))
129135 else :
130136 s_index = 0
131137 last_seen_subject = None
@@ -135,25 +141,27 @@ def to_dataframe(g: Graph) -> pd.DataFrame:
135141 o_idl = _get_idl_for_identifier (o )
136142 if o_idl == idl :
137143 if s_index == index :
138- p_subjects .append (_get_str_for_uri (g .namespace_manager , s ))
144+ p_subjects .append (_get_str_for_uriref (g .namespace_manager , s ))
139145 if isinstance (o , Literal ):
140146 p_objects .append (str (o ))
141147 else :
142- p_objects .append (_get_str_for_uri (g .namespace_manager , o ))
148+ p_objects .append (_get_str_for_uriref (g .namespace_manager , o ))
143149 s_index = s_index + 1
144150 last_seen_subject = s
145151 series [series_name ] = pd .Series (data = p_objects , index = p_subjects , dtype = np .unicode_ )
146152
147153 return pd .DataFrame (series )
148154
149- def _get_identifier (value : object , instance : str = None , datatype : str = None , language : str = None ) -> Identifier :
155+ def _get_identifier (prefixes : dict , value : object , instance : str = None , datatype : str = None , language : str = None ) -> Identifier :
150156 """
151157 Takes value extracted from the index, column or cell and returns
152158 an instance of Identifier (Literal, URIRef or BNode) using correct
153159 datatype and language.
154160
155161 Parameters
156162 ----------
163+ prefixes : dict
164+ Prefixes to use to normalize URIs
157165 value : object
158166 Value of index, column or cell
159167 instance : str
@@ -176,19 +184,32 @@ def _get_identifier(value: object, instance: str = None, datatype: str = None, l
176184 return Literal (value , lang = language )
177185 elif datatype :
178186 return Literal (value , datatype = URIRef (datatype ))
179- elif re . match ( '^\w*:\w*$' , str ( value )) or re . match ( '^http[s]?://.*$' , str ( value ) ):
187+ elif _is_uri ( value ):
180188 return URIRef (value )
189+ elif _is_curie (value ):
190+ return _get_uriref_for_curie (prefixes , value )
181191 else :
182192 return Literal (value )
183193 elif instance == Literal .__name__ :
184194 if language :
185195 return Literal (value , lang = language )
186196 elif datatype :
187- return Literal (value , datatype = URIRef (datatype ))
197+ if _is_uri (datatype ):
198+ datatype_uriref = URIRef (datatype )
199+ elif _is_curie (datatype ):
200+ datatype_uriref = _get_uriref_for_curie (prefixes , datatype )
201+ else :
202+ ValueError (f'Not a valid URI for datatype { datatype } ' )
203+ return Literal (value , datatype = datatype_uriref )
188204 else :
189205 return Literal (value )
190206 elif instance == URIRef .__name__ :
191- return URIRef (value )
207+ if _is_uri (value ):
208+ return URIRef (value )
209+ elif _is_curie (value ):
210+ return _get_uriref_for_curie (prefixes , value )
211+ else :
212+ ValueError (f'Not a valid URI { value } ' )
192213 elif instance == BNode .__name__ :
193214 return BNode (value )
194215
@@ -220,7 +241,7 @@ def _get_idl_for_identifier(i: Identifier) -> tuple:
220241
221242 return (instance , datatype , language )
222243
def _get_str_for_uriref(namespace_manager: NamespaceManager, uriref: URIRef) -> str:
    """
    Builds the string form of a URIRef used when transforming Graph to
    DataFrame. The URI is run through NamespaceManager.normalizeUri and
    every '<' and '>' character is then removed from the result.

    Parameters
    ----------
    namespace_manager : rdflib.namespace.NamespaceManager
        NamespaceManager to use to normalize URIs
    uriref : rdflib.URIRef
        URI to normalize

    Returns
    -------
    str
        Normalized URI string without angle brackets.

    """

    normalized = namespace_manager.normalizeUri(uriref)
    angle_brackets = '<|>'
    return re.sub(angle_brackets, '', normalized)
265+
def _get_uriref_for_curie(prefixes: dict, value: object) -> URIRef:
    """
    Converts curie string into URIRef with fully qualified URI.

    Parameters
    ----------
    prefixes : dict
        Prefixes to use to normalize URIs
    value : object
        Value from DataFrame to be converted to URIRef. It is converted
        with str() first, the same way _is_curie inspects it.

    Returns
    -------
    rdflib.URIRef
        URIRef built from the bound namespace plus the local name when the
        prefix is present in prefixes, otherwise URIRef of the text as is.

    """

    text = str(value)
    # Split on the first colon only: partition never raises, unlike tuple
    # unpacking of split(':') when the text has zero or several colons.
    prefix, separator, name = text.partition(':')
    if separator and prefix in prefixes:
        return URIRef(''.join((prefixes[prefix], name)))
    return URIRef(text)
289+
290+ def _is_curie (value : object ) -> bool :
291+ """
292+ Checks if value from DataFrame is a CURIE.
293+
294+ Parameters
295+ ----------
296+ value : object
297+ Value from DataFrame to be checked.
298+
299+ Returns
300+ -------
301+ bool
302+ True if value is matching CURIE pattern, false otherwise.
303+
304+ """
305+
306+ return re .match ('^\w*:\w*$' , str (value ))
307+
308+ def _is_uri (value : object ) -> bool :
309+ """
310+ Checks if value from DataFrame is a URI.
311+
312+ Parameters
313+ ----------
314+ value : object
315+ Value from DataFrame to be checked.
316+
317+ Returns
318+ -------
319+ bool
320+ True if value is matching URI pattern, false otherwise.
321+
322+ """
323+
324+ return re .match ('^http[s]?://.*$' , str (value ))
325+
0 commit comments