88import sys
99import argparse
1010import gensim
11+ import ast
12+ import astor
13+ import re
1114import tempfile
1215import json
1316from enum import Enum
@@ -55,6 +58,37 @@ def conditional_print(text, machine_friendly_output):
5558 if not machine_friendly_output :
5659 print (text )
5760
def remove_comments_and_docstrings(source_code: str) -> str:
    """Strip comments and docstrings from Python source code.

    The source is parsed into an AST and regenerated from it. Comments are
    not part of the AST, so they vanish in the round trip; docstrings are
    removed explicitly from modules, classes and (async) functions.

    .. seealso::

        https://gist.github.com/phpdude/1ae6f19de213d66286c8183e9e3b9ec1

    :param source_code: Raw source code as a single string
    :type source_code: str
    :return: Stripped source code as a single string
    :rtype: str
    :raises SyntaxError: If ``source_code`` is not valid Python
    """
    parsed = ast.parse(source_code)
    docstring_owners = (
        ast.FunctionDef,
        ast.ClassDef,
        ast.AsyncFunctionDef,
        ast.Module,
    )

    for node in ast.walk(parsed):
        if not isinstance(node, docstring_owners):
            continue

        # ast.get_docstring works across Python versions, unlike a direct
        # check against ast.Str (deprecated since 3.8, removed in 3.14).
        # Compare with None so that an empty-string docstring is dropped too.
        if ast.get_docstring(node, clean=False) is None:
            continue

        node.body = node.body[1:]

        # A def/class whose only statement was its docstring would be left
        # with an empty body, which cannot be turned back into valid source.
        if not node.body and not isinstance(node, ast.Module):
            node.body = [ast.Pass()]

    # Prefer the standard-library unparser (Python 3.9+); fall back to astor
    # on older interpreters where ast.unparse does not exist.
    unparse = getattr(ast, "unparse", None)
    if unparse is not None:
        return unparse(parsed)
    return astor.to_source(parsed)
91+
5892
5993def main ():
6094 parser_description = CliColors .HEADER + CliColors .BOLD + \
@@ -78,17 +112,18 @@ def main():
78112 help = "File extensions to check for similarities." )
79113 parser .add_argument ("--ignore-threshold" , type = int , default = 0 ,
80114 help = "Don't print out similarity below the ignore threshold" )
115+ parser .add_argument ("--only-code" , action = "store_true" , help = "Removes comments and docstrings from the source code before analysis" )
81116 args = parser .parse_args ()
82117
83118 result = run (args .fail_threshold , args .directories , args .files , args .ignore_directories ,
84119 args .ignore_files , args .json , args .project_root_dir , args .file_extensions ,
85- args .ignore_threshold )
120+ args .ignore_threshold , args . only_code )
86121
87122 return result
88123
89124
90125def run (fail_threshold , directories , files , ignore_directories , ignore_files ,
91- json_output , project_root_dir , file_extensions , ignore_threshold ):
126+ json_output , project_root_dir , file_extensions , ignore_threshold , only_code , ):
92127 # Determine which files to compare for similarities
93128 source_code_files = list ()
94129 files_to_ignore = list ()
@@ -139,7 +174,10 @@ def run(fail_threshold, directories, files, ignore_directories, ignore_files,
139174 # read file but also recover from encoding errors in source files
140175 with open (source_code_file , 'r' , errors = 'surrogateescape' ) as f :
141176 # Store source code with the file path as the key
142- source_code [source_code_file ] = f .read ()
177+ content = f .read ()
178+ if only_code :
179+ content = remove_comments_and_docstrings (content )
180+ source_code [source_code_file ] = content
143181 except Exception as err :
144182 print (f'ERROR: Failed to open file { source_code_file } , reason: { str (err )} ' )
145183
0 commit comments