Skip to content

Commit e3c9b36

Browse files
author
Miltos
authored
Update confusing CLI command
1 parent 96f6732 commit e3c9b36

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

tokenizers/python/tokenizepythoncorpus.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
tokenizepythoncorpus.py [options] PROJECTS_FOLDER OUTPUT_FOLDER
66
77
Options:
8-
--only-ids Return only identifiers
8+
--all-tokens Return all non-keyword tokens instead of only identifiers and strings
99
-h --help Show this screen.
1010
1111
"""
@@ -22,12 +22,12 @@
2222

2323
from dpu_utils.utils import save_jsonl_gz
2424

25-
def tokenize_file(filepath: str, only_ids: bool=False)-> Iterator[str]:
25+
def tokenize_file(filepath: str, all_tokens: bool=False)-> Iterator[str]:
2626
tokens = []
2727
try:
2828
with open(filepath, 'rb') as f:
2929
for toknum, tokval, _, _, _ in tokenize(f.readline):
30-
if not only_ids or toknum in {NAME, STRING}:
30+
if all_tokens or toknum in {NAME, STRING}:
3131
if not keyword.iskeyword(tokval):
3232
tokens.append(tokval)
3333
except Exception as e:
@@ -54,5 +54,5 @@ def tokenize_all_project_folders(directory: str, output_folder: str, only_ids: b
5454

5555
if __name__ == '__main__':
5656
args = docopt(__doc__)
57-
tokenize_all_project_folders(args['PROJECTS_FOLDER'], args['OUTPUT_FOLDER'], args['--only-ids'])
57+
tokenize_all_project_folders(args['PROJECTS_FOLDER'], args['OUTPUT_FOLDER'], args['--all-tokens'])
5858

0 commit comments

Comments
 (0)