From 00497cfc6c53ba244d1e268b8d0b646294b4f4fb Mon Sep 17 00:00:00 2001 From: Mohamed Abdelhady Date: Thu, 7 Dec 2017 19:49:17 -0800 Subject: [PATCH] Update 1_Download_and_Parse_XML_Spark.py --- .../1_Download_and_Parse_XML_Spark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/01_data_acquisition_and_understanding/1_Download_and_Parse_XML_Spark.py b/code/01_data_acquisition_and_understanding/1_Download_and_Parse_XML_Spark.py index a30d3aa..72d853d 100644 --- a/code/01_data_acquisition_and_understanding/1_Download_and_Parse_XML_Spark.py +++ b/code/01_data_acquisition_and_understanding/1_Download_and_Parse_XML_Spark.py @@ -81,7 +81,7 @@ def download_xml_gz_files(): print(len(file_collection)) for i in range(1, num_xml_files+1, batch_size): - file_collection = ['medline17n%04d.xml.gz' % j + file_collection = ['medline18n%04d.xml.gz' % j for j in range(i, min([i + batch_size, num_xml_files +1]) ) if not os.path.exists(os.path.join(xml_local_dir,'medline17n%04d.xml.gz' % j))] @@ -119,7 +119,7 @@ def process_files(): print('The directory {} does not exist'.format(xml_local_dir)) for i in range(1, num_xml_files+1, batch_size): - file_collection = [os.path.join(xml_local_dir,'medline17n%04d.xml.gz' % j) + file_collection = [os.path.join(xml_local_dir,'medline18n%04d.xml.gz' % j) for j in range(i, i + batch_size) if os.path.exists(os.path.join(xml_local_dir,'medline17n%04d.xml.gz' % j))]