44from __future__ import print_function
55from __future__ import unicode_literals
66
7+ import operator
78import os
89import tarfile
910import typing
1011from collections import OrderedDict
1112from typing import cast , IO
1213
1314import six
15+ from six .moves import map
1416
1517from . import errors
1618from .base import FS
2224from .opener import open_fs
2325from .permissions import Permissions
2426from ._url_tools import url_quote
25- from .path import relpath , basename , isbase , normpath , parts , frombase
27+ from .path import (
28+ dirname ,
29+ join ,
30+ relpath ,
31+ basename ,
32+ isbase ,
33+ normpath ,
34+ parts ,
35+ frombase ,
36+ recursepath ,
37+ relativefrom ,
38+ )
2639from .wrapfs import WrapFS
2740
2841if typing .TYPE_CHECKING :
@@ -255,6 +268,8 @@ class ReadTarFS(FS):
255268 tarfile .SYMTYPE : ResourceType .symlink ,
256269 tarfile .CONTTYPE : ResourceType .file ,
257270 tarfile .LNKTYPE : ResourceType .symlink ,
271+ # this is how we mark implicit directories
272+ tarfile .DIRTYPE + b"i" : ResourceType .directory ,
258273 }
259274
260275 @errors .CreateFailed .catch_all
@@ -275,24 +290,66 @@ def _directory_entries(self):
275290 """Lazy directory cache."""
276291 if self ._directory_cache is None :
277292 _decode = self ._decode
293+ _encode = self ._encode
294+
295+ # collect all directory entries and remove slashes
278296 _directory_entries = (
279297 (_decode (info .name ).strip ("/" ), info ) for info in self ._tar
280298 )
281299
282- def _list_tar ():
283- for name , info in _directory_entries :
284- try :
285- _name = normpath (name )
286- except IllegalBackReference :
287- # Back references outside root, must be up to no good.
288- pass
289- else :
290- if _name :
291- yield _name , info
292-
293- self ._directory_cache = OrderedDict (_list_tar ())
300+ # build the cache first before updating it to reduce chances
301+ # of data races
302+ _cache = OrderedDict ()
303+ for name , info in _directory_entries :
304+ # check for any invalid back references
305+ try :
306+ _name = normpath (name )
307+ except IllegalBackReference :
308+ continue
309+
310+ # add all implicit dirnames if not in the cache already
311+ for partial_name in map (relpath , recursepath (_name )):
312+ dirinfo = tarfile .TarInfo (self ._encode (partial_name ))
313+ dirinfo .type = tarfile .DIRTYPE
314+ _cache .setdefault (partial_name , dirinfo )
315+
316+ # add the entry itself, potentially overwriting implicit entries
317+ _cache [_name ] = info
318+
319+ self ._directory_cache = _cache
294320 return self ._directory_cache
295321
def _follow_symlink(self, entry):
    """Follow a symlink `TarInfo` to find a concrete entry.

    Arguments:
        entry (tarfile.TarInfo): the archive member to start from. It is
            returned unchanged when it is not a symbolic link.

    Returns:
        tarfile.TarInfo: the first member reached through the chain of
        link targets that is not itself a symbolic link.

    Raises:
        errors.ResourceNotFound: if a link target cannot be resolved to
            an entry of the directory cache, or if the chain of symlinks
            loops back onto a link that was already followed.

    """
    _entry = entry
    # names of the symlinks already followed; without this guard a cycle
    # such as ``a -> b`` / ``b -> a`` would keep this loop spinning forever
    _seen = set()
    while _entry.issym():
        name = self._decode(_entry.name)
        # the link target is interpreted relative to the directory that
        # contains the symlink itself
        linkname = normpath(join(dirname(name), self._decode(_entry.linkname)))
        if name in _seen:
            raise errors.ResourceNotFound(linkname)
        _seen.add(name)
        resolved = self._resolve(linkname)
        if resolved is None:
            raise errors.ResourceNotFound(linkname)
        _entry = self._directory_entries[resolved]

    return _entry
335+
def _resolve(self, path):
    """Replace path components that are symlinks with concrete components.

    Arguments:
        path (str): a relative path inside the archive, in the same form
            as the keys of the directory cache.

    Returns:
        str or None: ``path`` unchanged when it is empty or is already a
        key of the directory cache. Otherwise, the result of resolving
        the path again after replacing the first symlinked prefix found
        (prefixes are tried in the reverse of `recursepath` order) with
        the name of the entry that symlink leads to. `None` when no
        prefix of ``path`` is a symlink.

    """
    entries = self._directory_entries
    if not path or path in entries:
        return path
    for prefix in map(relpath, reversed(recursepath(path))):
        suffix = relativefrom(prefix, path)
        member = entries.get(prefix)
        if member is None or not member.issym():
            continue
        # swap the symlinked prefix for its concrete target, then resolve
        # again in case the remainder crosses further symlinks
        concrete = self._follow_symlink(member)
        return self._resolve(join(self._decode(concrete.name), suffix))
    return None
352+
def __repr__(self):
    # type: () -> Text
    """Return ``ReadTarFS(...)`` wrapping the repr of ``self._file``."""
    template = "ReadTarFS({!r})"
    return template.format(self._file)
@@ -327,31 +384,35 @@ def getinfo(self, path, namespaces=None):
327384 namespaces = namespaces or ()
328385 raw_info = {} # type: Dict[Text, Dict[Text, object]]
329386
387+ # special case for root
330388 if not _path :
331389 raw_info ["basic" ] = {"name" : "" , "is_dir" : True }
332390 if "details" in namespaces :
333391 raw_info ["details" ] = {"type" : int (ResourceType .directory )}
334392
335393 else :
336- try :
337- implicit = False
338- member = self ._directory_entries [_path ]
339- except KeyError :
340- if not self .isdir (_path ):
341- raise errors .ResourceNotFound (path )
342- implicit = True
343- member = tarfile .TarInfo (_path )
344- member .type = tarfile .DIRTYPE
394+
395+ _realpath = self ._resolve (_path )
396+ if _realpath is None :
397+ raise errors .ResourceNotFound (path )
398+
399+ implicit = False
400+ member = self ._directory_entries [_realpath ]
345401
346402 raw_info ["basic" ] = {
347403 "name" : basename (self ._decode (member .name )),
348- "is_dir" : member .isdir (),
404+ "is_dir" : self .isdir (_path ), # is_dir should follow symlinks
349405 }
350406
351407 if "link" in namespaces :
352- raw_info ["link" ] = {
353- "target" : self ._decode (member .linkname ) if member .issym () else None
354- }
408+ if member .issym ():
409+ target = join (
410+ dirname (self ._decode (member .name )),
411+ self ._decode (member .linkname ),
412+ )
413+ else :
414+ target = None
415+ raw_info ["link" ] = {"target" : target }
355416 if "details" in namespaces :
356417 raw_info ["details" ] = {
357418 "size" : member .size ,
@@ -381,23 +442,29 @@ def getinfo(self, path, namespaces=None):
381442
def isdir(self, path):
    # type: (Text) -> bool
    """Check if a path maps to an existing directory, following symlinks.

    Arguments:
        path (str): a path inside the archive.

    Returns:
        bool: `True` if ``path`` is the root, or resolves (through any
        symlinks) to a directory entry; `False` otherwise, including when
        the path does not exist or goes through a broken symlink.

    """
    _path = relpath(self.validatepath(path))
    # the root is always a directory, even when the archive has no member
    # (and therefore no entry at all in the directory cache)
    if not _path:
        return True
    try:
        realpath = self._resolve(_path)
        if realpath is None:
            return False
        if not realpath:
            # a symlink that leads back to the root
            return True
        entry = self._directory_entries[realpath]
        return self._follow_symlink(entry).isdir()
    except errors.ResourceNotFound:
        # a symlink on the way points to a target that is not in the
        # archive: the path does not map to anything
        return False
388451
def isfile(self, path):
    # type: (Text) -> bool
    """Check if a path maps to an existing file, following symlinks.

    Arguments:
        path (str): a path inside the archive.

    Returns:
        bool: `True` if ``path`` resolves (through any symlinks) to an
        entry for which `TarInfo.isfile` is true; `False` otherwise,
        including for the root, for a missing path, and for a path that
        goes through a broken symlink.

    """
    _path = relpath(self.validatepath(path))
    try:
        realpath = self._resolve(_path)
        # `None` means the path does not exist; an empty string is the
        # root, which is a directory and may have no cache entry at all
        if not realpath:
            return False
        entry = self._directory_entries[realpath]
        return self._follow_symlink(entry).isfile()
    except errors.ResourceNotFound:
        # a symlink on the way points to a target that is not in the
        # archive: the path does not map to anything
        return False
395460
def islink(self, path):
    # type: (Text) -> bool
    """Check if a path maps to a symbolic link.

    Symlinks in the leading components of ``path`` are resolved, but the
    entry the path finally maps to is *not* followed, so that a link is
    reported as a link even when its target is missing.

    Arguments:
        path (str): a path inside the archive.

    Returns:
        bool: `True` if the resolved entry is a symlink; `False`
        otherwise, including for the root, for a missing path, and when
        a leading component is a broken symlink.

    """
    _path = relpath(self.validatepath(path))
    try:
        realpath = self._resolve(_path)
    except errors.ResourceNotFound:
        # a leading component is a symlink whose target is missing
        return False
    # `None` means the path does not exist; an empty string is the root,
    # which is never a link and may have no cache entry at all
    if not realpath:
        return False
    return self._directory_entries[realpath].issym()
402469
403470 def setinfo (self , path , info ):
@@ -409,13 +476,28 @@ def listdir(self, path):
409476 # type: (Text) -> List[Text]
410477 _path = relpath (self .validatepath (path ))
411478
412- if not self .gettype (path ) is ResourceType .directory :
413- raise errors .DirectoryExpected (path )
479+ # check the given path exists
480+ realpath = self ._resolve (_path )
481+ if realpath is None :
482+ raise errors .ResourceNotFound (path )
483+ elif realpath :
484+ target = self ._follow_symlink (self ._directory_entries [realpath ])
485+ # check the path is either a symlink mapping to a directory or a directory
486+ if target .isdir ():
487+ base = target .name
488+ elif target .issym ():
489+ base = target .linkname
490+ else :
491+ raise errors .DirectoryExpected (path )
492+ else :
493+ base = ""
414494
495+ # find all entries in the actual directory
415496 children = (
416- frombase (_path , n ) for n in self ._directory_entries if isbase (_path , n )
497+ frombase (base , n ) for n in self ._directory_entries if isbase (base , n )
417498 )
418499 content = (parts (child )[1 ] for child in children if relpath (child ))
500+
419501 return list (OrderedDict .fromkeys (content ))
420502
421503 def makedir (
@@ -432,17 +514,18 @@ def openbin(self, path, mode="r", buffering=-1, **options):
432514 # type: (Text, Text, int, **Any) -> BinaryIO
433515 _path = relpath (self .validatepath (path ))
434516
517+ # check the requested mode is only a reading mode
435518 if "w" in mode or "+" in mode or "a" in mode :
436519 raise errors .ResourceReadOnly (path )
437520
438- try :
439- member = self ._directory_entries [ _path ]
440- except KeyError :
441- six . raise_from ( errors .ResourceNotFound (path ), None )
521+ # check the path actually resolves after following symlinks
522+ _realpath = self ._resolve ( _path )
523+ if _realpath is None :
524+ raise errors .ResourceNotFound (path )
442525
443- # TarFile.extractfile returns None if the entry is
526+ # TarFile.extractfile returns None if the entry is
444527 # neither a file nor a symlink
445- reader = self ._tar .extractfile (member )
528+ reader = self ._tar .extractfile (self . _directory_entries [ _realpath ] )
446529 if reader is None :
447530 raise errors .FileExpected (path )
448531
0 commit comments