Integrate scancode data

These changes are work towards tern-tools#480. This commit adds a setter for file_type in the FileData class and integrates the scancode data into Tern's data model.
aditi137 · Mar 6, 2020 · c9ff72f · c9ff72f
2 parents a4c107f + 403645f
commit c9ff72f
Show file tree

Hide file tree

Showing 4 changed files with 117 additions and 37 deletions.
diff --git a/tern/analyze/passthrough.py b/tern/analyze/passthrough.py
@@ -9,6 +9,7 @@
 
 
 import logging
+import os
 import shutil
 from stevedore import driver
 from stevedore.exception import NoMatches
@@ -21,25 +22,45 @@
 logger = logging.getLogger(constants.logger_name)
 
 
-def get_filesystem_command(layer_obj, command):
- '''Given an ImageLayer object and a command in the form of a string,
- return the command in list form with the target directory of the layer.
- This assumes that the layer tarball is untarred, which should have happened
- during the loading of the Image object'''
- cmd_list = command.split(' ')
+def get_exec_command(command_string):
+    '''Given a command as a string, find out if the command exists on the
+    system. If it does exist, return a subprocess invokable command list
+    where the command is the absolute path of the binary existing on the
+    system.
+    The string is split on single spaces; the first token is taken as the
+    name of the binary and is looked up with shutil.which.
+    Raises OSError if the binary cannot be found'''
+    cmd_list = command_string.split(' ')
     # we first find if the command exists on the system
     run_bin = cmd_list.pop(0)
     bin_path = shutil.which(run_bin)
     if not bin_path:
         raise OSError("Command {} not found".format(run_bin))
     cmd_list.insert(0, bin_path)
+    return cmd_list
+
+
+def get_filesystem_command(layer_obj, command):
+    '''Given an ImageLayer object and a command in the form of a string,
+    return the command in list form with the target directory of the layer.
+    This assumes that the layer tarball is untarred, which should have happened
+    during the loading of the Image object.
+    Raises OSError (from get_exec_command) if the command's binary is not
+    found on the system'''
     # in most cases, the external tool has a CLI where the target directory
     # is the last token in the command. So the most straightforward way
     # to perform this operation is to append the target directory
+    cmd_list = get_exec_command(command)
     cmd_list.append(rootfs.get_untar_dir(layer_obj.tar_file))
     return cmd_list
 
 
+def get_file_command(layer_tar_file, layer_file, command):
+    '''Build the command list used to run an external tool on a single file.
+    layer_tar_file is an ImageLayer object's tar_file property, layer_file
+    is a FileData object from that layer and command is the tool invocation
+    as a string. The file's location inside the layer's untar directory is
+    the last token of the returned list'''
+    exec_cmd = get_exec_command(command)
+    untar_dir = rootfs.get_untar_dir(layer_tar_file)
+    target = os.path.join(untar_dir, layer_file.path)
+    return exec_cmd + [target]
+
+
 def execute_external_command(layer_obj, command, is_sudo=False):
  '''Given an Imagelayer object and a command in the form of a list, execute
  the command and store the results in the ImageLayer object either as

diff --git a/tern/classes/file_data.py b/tern/classes/file_data.py
@@ -111,6 +111,10 @@ def version(self):
  def file_type(self):
  return self.__file_type
 
+    @file_type.setter
+    def file_type(self, file_type):
+        '''Store the given value as this object's file type'''
+        self.__file_type = file_type
+
  @property
  def origins(self):
  return self.__origins

diff --git a/tern/extensions/scancode/executor.py b/tern/extensions/scancode/executor.py
@@ -16,10 +16,11 @@
 
 import json
 import logging
-import sys
 
 from tern.analyze.passthrough import get_filesystem_command
+from tern.analyze.passthrough import get_file_command
 from tern.classes.notice import Notice
+from tern.classes.file_data import FileData
 from tern.extensions.executor import Executor
 from tern.utils import constants
 from tern.utils import rootfs
@@ -28,44 +29,96 @@
 logger = logging.getLogger(constants.logger_name)
 
 
-def run_on_image(image_obj, command):
- '''Scancode errors out when it fails to scan any file it is given even
- if it is successful with other files. Hence we cannot use the available
- run_on_image function in the passthrough module. Instead we will check
- if a json object was returned or not'''
- if not command:
- logger.error("No command to execute. No report will be generated")
- return False
- for layer in image_obj.layers:
- layer.files_analyzed = True
- full_cmd = get_filesystem_command(layer, command)
- origin_layer = 'Layer: ' + layer.fs_hash[:10]
+def analyze_layer(layer_obj):
+    '''Scan a whole layer directory with scancode. Every entry of type
+    'file' in the scancode report becomes a FileData object which is added
+    to the layer object. If scancode returns nothing, the error is logged
+    and recorded as a Notice on the layer; scan errors reported for a file
+    are recorded as Notices on that file's FileData object
+    '''
+    # scancode is pointed at the layer's directory
+    scan_cmd = get_filesystem_command(
+        layer_obj, 'scancode -ilpcu --quiet --json -')
+    layer_origin = 'Layer: ' + layer_obj.fs_hash[:10]
+    output, failure = rootfs.shell_command(True, scan_cmd)
+    if not output:
+        logger.error(
+            "No scancode results for this layer: %s", str(failure))
+        layer_obj.origins.add_notice_to_origins(
+            layer_origin, Notice(str(failure), 'error'))
+        return
+    # one FileData object per regular file in the report
+    for entry in json.loads(output)['files']:
+        if entry['type'] != 'file':
+            continue
+        file_obj = FileData(
+            entry['name'], entry['path'], entry['date'], entry['file_type'])
+        file_obj.set_checksum('sha1', entry['sha1'])
+        if entry['licenses']:
+            file_obj.licenses = [
+                lic['short_name'] for lic in entry['licenses']]
+        file_obj.license_expressions = entry['license_expressions']
+        if entry['copyrights']:
+            file_obj.copyrights = [
+                cpr['value'] for cpr in entry['copyrights']]
+        if entry['urls']:
+            file_obj.urls = [link['url'] for link in entry['urls']]
+        file_obj.packages = entry['packages']
+        file_obj.authors = entry['authors']
+        if entry['scan_errors']:
+            # every scan error becomes its own notice on the file
+            for scan_err in entry['scan_errors']:
+                file_obj.origins.add_notice_to_origins(
+                    'File: ' + file_obj.path, Notice(scan_err, 'error'))
+        # hand the finished FileData object over to the layer
+        layer_obj.add_file(file_obj)
+
+
+def analyze_file(layer_obj):
+    '''Use scancode to analyze files Tern has already found in an image layer.
+    For each file in the layer, run scancode on the file and fill the results
+    into the existing FileData object. We assume that we already have the
+    files names, paths and checksums filled out. If scancode returns nothing
+    for a file, the error is logged and recorded as a Notice on that file'''
+    # run scancode against each file
+    command = 'scancode -ilpcu --quiet --json -'
+    for fd in layer_obj.files:
+        # get_file_command reads the path off the FileData object itself,
+        # so pass the object rather than its path string
+        full_cmd = get_file_command(layer_obj.tar_file, fd, command)
+        origin_file = 'File: ' + fd.path
         result, error = rootfs.shell_command(True, full_cmd)
         if not result:
             logger.error(
-                "No scancode results for this layer: %s", str(error))
-            layer.origins.add_notice_to_origins(
-                origin_layer, Notice(str(error), 'error'))
-        layer.analyzed_output = result.decode()
-    return True
+                "No scancode results for this file: %s", str(error))
+            fd.origins.add_notice_to_origins(
+                origin_file, Notice(str(error), 'error'))
+        else:
+            # Fill the results into the FileData object
+            data = json.loads(result)['files'][0]
+            fd.date = data['date']
+            fd.file_type = data['file_type']
+            if data['licenses']:
+                fd.licenses = [l['short_name'] for l in data['licenses']]
+            fd.license_expressions = data['license_expressions']
+            if data['copyrights']:
+                fd.copyrights = [c['value'] for c in data['copyrights']]
+            if data['urls']:
+                fd.urls = [u['url'] for u in data['urls']]
+            fd.packages = data['packages']
+            fd.authors = data['authors']
+            if data['scan_errors']:
+                # for each scan error make a notice
+                for err in data['scan_errors']:
+                    fd.origins.add_notice_to_origins(
+                        origin_file, Notice(err, 'error'))
+            # fd already belongs to layer_obj.files, so it is not added to
+            # the layer again while that collection is being iterated
 
 
 class Scancode(Executor):
     '''Execute scancode'''
     def execute(self, image_obj):
         '''Execution should be:
-            scancode -lpcu --quiet --json - /path/to/directory
+            scancode -ilpcu --quiet --json - /path/to/directory
+        or, for a layer that already has files listed, once per file:
+            scancode -ilpcu --quiet --json - /path/to/file
         '''
-        command = 'scancode -lpcu --quiet --json -'
-        # run the command against the image filesystems
-        if not run_on_image(image_obj, command):
-            sys.exit(1)
-        # for now we just print the file path and licenses found if there are
-        # any licenses are found
         for layer in image_obj.layers:
-            print('Layer: {}'.format(layer.diff_id[:10]))
-            results = json.loads(layer.analyzed_output)
-            for afile in results['files']:
-                if afile['licenses']:
-                    license_str = ','.join(l['key'] for l in afile['licenses'])
-                    print('{}: {}'.format(afile['path'], license_str))
+            layer.files_analyzed = True
+            if layer.files:
+                # If the layer already has files processed, then run
+                # scancode per file
+                analyze_file(layer)
+            else:
+                # If there was no file processing done, scancode will process
+                # them for you
+                analyze_layer(layer)
diff --git a/tests/test_class_file_data.py b/tests/test_class_file_data.py
@@ -41,6 +41,8 @@ def testInstance(self):
  file2 = FileData('file2',
  'path/to/file2',
  '12355')
+ file1.file_type = 'ELF'
+ self.assertEqual(file1.file_type, 'ELF')
  file2 = FileData('file2',
  'path/to/file2',
  '2020-01-01',