From 0bdb3b9d142c3180f4b8ffbe040705b93973e10f Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Mon, 7 Apr 2025 13:15:22 +0300 Subject: [PATCH 01/13] removed creating of tempFile from inputstream, used imputstream directly --- .../content/PreviewContentServiceImpl.java | 68 ++++++++++--------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index e6cd2ec12f76..e8b27feae128 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -14,11 +14,9 @@ import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; import java.nio.file.FileSystem; -import java.nio.file.FileSystems; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.nio.file.StandardCopyOption; import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; @@ -31,6 +29,8 @@ import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; @@ -311,38 +311,40 @@ private void addFilePath(List filePaths, String path, long size) { filePaths.add(fileInfo); } - /** - * Processes a TAR file, extracting its entries and adding their paths to the provided list. - * @param filePaths the list to populate with the extracted file paths - * @param tempFile the temporary TAR file to process - * @throws IOException if an I/O error occurs while reading the TAR file - */ - private void processTarFile(List filePaths, Path tempFile) throws IOException { - try (InputStream fi = Files.newInputStream(tempFile); - TarArchiveInputStream tis = new TarArchiveInputStream(fi)) { +// /** +// * Processes a TAR file, extracting its entries and adding their paths to the provided list. +// * @param filePaths the list to populate with the extracted file paths +// * @param tempFile the temporary TAR file to process +// * @throws IOException if an I/O error occurs while reading the TAR file +// */ + private void processTarFile(List filePaths, InputStream inputStream) throws IOException { + try (TarArchiveInputStream tis = new TarArchiveInputStream(inputStream)) { TarArchiveEntry entry; while ((entry = tis.getNextTarEntry()) != null) { + // Add the file path and its size (from the TAR entry) addFilePath(filePaths, entry.getName(), entry.getSize()); } } } - /** - * Processes a ZIP file, extracting its entries and adding their paths to the provided list. - * @param filePaths the list to populate with the extracted file paths - * @param zipFileSystem the FileSystem object representing the ZIP file - * @throws IOException if an I/O error occurs while reading the ZIP file - */ - private void processZipFile(List filePaths, FileSystem zipFileSystem) throws IOException { - Path root = zipFileSystem.getPath("/"); - Files.walk(root).forEach(path -> { - try { - long fileSize = Files.size(path); - addFilePath(filePaths, path.toString().substring(1), fileSize); - } catch (IOException e) { - log.error("An error occurred while getting the size of the zip file.", e); +// /** +// * Processes a ZIP file, extracting its entries and adding their paths to the provided list. +// * @param filePaths the list to populate with the extracted file paths +// * @param zipFileSystem the FileSystem object representing the ZIP file +// * @throws IOException if an I/O error occurs while reading the ZIP file +// */ + private void processZipFile(List filePaths, InputStream inputStream) throws IOException { + try (ZipInputStream zipInputStream = new ZipInputStream(inputStream)) { + ZipEntry entry; + // Read each entry in the ZIP file + while ((entry = zipInputStream.getNextEntry()) != null) { + if (!entry.isDirectory()) { + // For each file in the ZIP, you can get its size and path + long fileSize = entry.getSize(); + addFilePath(filePaths, entry.getName(), fileSize); + } } - }); + } } /** @@ -414,26 +416,26 @@ private String buildXmlResponse(List filePaths) { */ private String extractFile(InputStream inputStream, String fileType) throws Exception { List filePaths = new ArrayList<>(); - Path tempFile = null; + //Path tempFile = null; FileSystem zipFileSystem = null; try { // Create a temporary file based on the file type - tempFile = createTempFile(fileType); + //tempFile = createTempFile(fileType); // Copy the input stream to the temporary file - Files.copy(inputStream, tempFile, StandardCopyOption.REPLACE_EXISTING); + //Files.copy(inputStream, tempFile, StandardCopyOption.REPLACE_EXISTING); // Process the file based on its type if (ARCHIVE_TYPE_TAR.equals(fileType)) { - processTarFile(filePaths, tempFile); + processTarFile(filePaths, inputStream); } else { - zipFileSystem = FileSystems.newFileSystem(tempFile, (ClassLoader) null); - processZipFile(filePaths, zipFileSystem); + //zipFileSystem = FileSystems.newFileSystem(tempFile, (ClassLoader) null); + processZipFile(filePaths, inputStream); } } finally { closeFileSystem(zipFileSystem); - deleteTempFile(tempFile); + //deleteTempFile(tempFile); } return buildXmlResponse(filePaths); From a23b1d1ff342b99caf819efac595096e964d1ece Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Tue, 8 Apr 2025 08:44:57 +0300 Subject: [PATCH 02/13] added methods documentation --- .../content/PreviewContentServiceImpl.java | 61 +++++++------------ 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index e8b27feae128..7ba5bc705b94 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -311,35 +311,36 @@ private void addFilePath(List filePaths, String path, long size) { filePaths.add(fileInfo); } -// /** -// * Processes a TAR file, extracting its entries and adding their paths to the provided list. -// * @param filePaths the list to populate with the extracted file paths -// * @param tempFile the temporary TAR file to process -// * @throws IOException if an I/O error occurs while reading the TAR file -// */ + /** + * Processes a TAR file, extracting its entries and adding their paths to the provided list. + * @param filePaths the list to populate with the extracted file paths + * @param inputStream the input stream for the process + * @throws IOException if an I/O error occurs while reading the TAR file + */ private void processTarFile(List filePaths, InputStream inputStream) throws IOException { try (TarArchiveInputStream tis = new TarArchiveInputStream(inputStream)) { TarArchiveEntry entry; while ((entry = tis.getNextTarEntry()) != null) { - // Add the file path and its size (from the TAR entry) - addFilePath(filePaths, entry.getName(), entry.getSize()); + if (!entry.isDirectory()) { + // Add the file path and its size (from the TAR entry) + addFilePath(filePaths, entry.getName(), entry.getSize()); + } } } } -// /** -// * Processes a ZIP file, extracting its entries and adding their paths to the provided list. -// * @param filePaths the list to populate with the extracted file paths -// * @param zipFileSystem the FileSystem object representing the ZIP file -// * @throws IOException if an I/O error occurs while reading the ZIP file -// */ + /** + * Processes a ZIP file, extracting its entries and adding their paths to the provided list. + * @param filePaths the list to populate with the extracted file paths + * @param inputStream the FileSystem object representing the ZIP file + * @throws IOException if an I/O error occurs while reading the ZIP file + */ private void processZipFile(List filePaths, InputStream inputStream) throws IOException { try (ZipInputStream zipInputStream = new ZipInputStream(inputStream)) { ZipEntry entry; - // Read each entry in the ZIP file while ((entry = zipInputStream.getNextEntry()) != null) { if (!entry.isDirectory()) { - // For each file in the ZIP, you can get its size and path + // Add the file path and its size (from the ZIP entry) long fileSize = entry.getSize(); addFilePath(filePaths, entry.getName(), fileSize); } @@ -408,7 +409,7 @@ private String buildXmlResponse(List filePaths) { } /** - * Extracts files from an InputStream, processes them based on the specified file type (tar or zip), + * Processes file data based on the specified file type (tar or zip), * and returns an XML representation of the file paths. * @param inputStream the InputStream containing the file data * @param fileType the type of file to extract ("tar" or "zip") @@ -416,28 +417,12 @@ private String buildXmlResponse(List filePaths) { */ private String extractFile(InputStream inputStream, String fileType) throws Exception { List filePaths = new ArrayList<>(); - //Path tempFile = null; - FileSystem zipFileSystem = null; - - try { - // Create a temporary file based on the file type - //tempFile = createTempFile(fileType); - - // Copy the input stream to the temporary file - //Files.copy(inputStream, tempFile, StandardCopyOption.REPLACE_EXISTING); - - // Process the file based on its type - if (ARCHIVE_TYPE_TAR.equals(fileType)) { - processTarFile(filePaths, inputStream); - } else { - //zipFileSystem = FileSystems.newFileSystem(tempFile, (ClassLoader) null); - processZipFile(filePaths, inputStream); - } - } finally { - closeFileSystem(zipFileSystem); - //deleteTempFile(tempFile); + // Process the file based on its type + if (ARCHIVE_TYPE_TAR.equals(fileType)) { + processTarFile(filePaths, inputStream); + } else { + processZipFile(filePaths, inputStream); } - return buildXmlResponse(filePaths); } From 7f49fe1c820e47d5695e3fe879716bcf276a3b78 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Tue, 8 Apr 2025 08:46:10 +0300 Subject: [PATCH 03/13] fix documentation --- .../java/org/dspace/content/PreviewContentServiceImpl.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index 7ba5bc705b94..c6b130c3b12d 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -314,7 +314,7 @@ private void addFilePath(List filePaths, String path, long size) { /** * Processes a TAR file, extracting its entries and adding their paths to the provided list. * @param filePaths the list to populate with the extracted file paths - * @param inputStream the input stream for the process + * @param inputStream the TAR file data * @throws IOException if an I/O error occurs while reading the TAR file */ private void processTarFile(List filePaths, InputStream inputStream) throws IOException { @@ -332,7 +332,7 @@ private void processTarFile(List filePaths, InputStream inputStream) thr /** * Processes a ZIP file, extracting its entries and adding their paths to the provided list. * @param filePaths the list to populate with the extracted file paths - * @param inputStream the FileSystem object representing the ZIP file + * @param inputStream the ZIP file data * @throws IOException if an I/O error occurs while reading the ZIP file */ private void processZipFile(List filePaths, InputStream inputStream) throws IOException { From 043792c51f93cb1086003a2fdfd7e0bb972238b6 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Tue, 8 Apr 2025 16:08:47 +0300 Subject: [PATCH 04/13] added parallelism and buffer for faster run --- .../content/PreviewContentServiceImpl.java | 87 +++++++++++++++---- 1 file changed, 72 insertions(+), 15 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index c6b130c3b12d..81f95a3cd645 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -7,6 +7,7 @@ */ package org.dspace.content; +import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; @@ -26,6 +27,11 @@ import java.util.Map; import java.util.Objects; import java.util.UUID; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -312,39 +318,90 @@ private void addFilePath(List filePaths, String path, long size) { } /** - * Processes a TAR file, extracting its entries and adding their paths to the provided list. - * @param filePaths the list to populate with the extracted file paths - * @param inputStream the TAR file data + * Processes a TAR file, extracting its entries and adding their paths and sizes to the provided list. + * Utilizes parallelism to process each TAR entry concurrently. + * + * @param filePaths the list to populate with the extracted file paths and sizes + * @param inputStream the InputStream containing the TAR file data * @throws IOException if an I/O error occurs while reading the TAR file + * @throws InterruptedException if the current thread is interrupted while waiting for the completion of a task + * @throws ExecutionException if an exception occurs during the execution of a parallel task */ - private void processTarFile(List filePaths, InputStream inputStream) throws IOException { - try (TarArchiveInputStream tis = new TarArchiveInputStream(inputStream)) { + private void processTarFile(List filePaths, InputStream inputStream) + throws IOException, InterruptedException, ExecutionException { + int numProcessors = Runtime.getRuntime().availableProcessors(); + ExecutorService executorService = Executors.newFixedThreadPool(numProcessors); + List>> futures = new ArrayList<>(); + + try (BufferedInputStream bufferedStream = new BufferedInputStream(inputStream); + TarArchiveInputStream tis = new TarArchiveInputStream(bufferedStream)) { + TarArchiveEntry entry; while ((entry = tis.getNextTarEntry()) != null) { if (!entry.isDirectory()) { - // Add the file path and its size (from the TAR entry) - addFilePath(filePaths, entry.getName(), entry.getSize()); + String entryName = entry.getName(); + long fileSize = entry.getSize(); + + // Submit a task to process each entry and return a list containing the path + Callable> task = () -> { + List localPaths = new ArrayList<>(); + addFilePath(localPaths, entryName, fileSize); + return localPaths; + }; + futures.add(executorService.submit(task)); } } + + // Collect results from all tasks in a thread-safe manner + for (Future> future : futures) { + filePaths.addAll(future.get()); + } + + } finally { + executorService.shutdown(); } } /** - * Processes a ZIP file, extracting its entries and adding their paths to the provided list. - * @param filePaths the list to populate with the extracted file paths - * @param inputStream the ZIP file data - * @throws IOException if an I/O error occurs while reading the ZIP file + * Processes a ZIP file, extracting its entries and adding their paths and sizes to the provided list. + * Utilizes parallelism to process each ZIP entry concurrently. + * + * @param filePaths the list to populate with the extracted file paths and sizes + * @param inputStream the InputStream containing the ZIP file data + * @throws IOException if an I/O error occurs while reading the ZIP file + * @throws InterruptedException if the current thread is interrupted while waiting for the completion of a task + * @throws ExecutionException if an exception occurs during the execution of a parallel task */ - private void processZipFile(List filePaths, InputStream inputStream) throws IOException { - try (ZipInputStream zipInputStream = new ZipInputStream(inputStream)) { + private void processZipFile(List filePaths, InputStream inputStream) + throws IOException, InterruptedException, ExecutionException { + int numProcessors = Runtime.getRuntime().availableProcessors(); + ExecutorService executorService = Executors.newFixedThreadPool(numProcessors); + List>> futures = new ArrayList<>(); + + try (BufferedInputStream bufferedStream = new BufferedInputStream(inputStream); + ZipInputStream zipInputStream = new ZipInputStream(bufferedStream)) { + ZipEntry entry; while ((entry = zipInputStream.getNextEntry()) != null) { if (!entry.isDirectory()) { - // Add the file path and its size (from the ZIP entry) + String entryName = entry.getName(); long fileSize = entry.getSize(); - addFilePath(filePaths, entry.getName(), fileSize); + + Callable> task = () -> { + List localPaths = new ArrayList<>(); + addFilePath(localPaths, entryName, fileSize); + return localPaths; + }; + futures.add(executorService.submit(task)); } } + + for (Future> future : futures) { + filePaths.addAll(future.get()); // Thread-safe addition + } + + } finally { + executorService.shutdown(); } } From cbffb8d8ea3e7286bca59d8cec637dfd63c880fd Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Wed, 9 Apr 2025 10:09:52 +0300 Subject: [PATCH 05/13] removed unneeded methods --- .../content/PreviewContentServiceImpl.java | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index 81f95a3cd645..716222d68cba 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -293,18 +293,6 @@ private Hashtable createSubMap(Map sourceMap, Funct return sub; } - /** - * Creates a temporary file with the appropriate extension based on the specified file type. - * @param fileType the type of file for which to create a temporary file - * @return a Path object representing the temporary file - * @throws IOException if an I/O error occurs while creating the file - */ - private Path createTempFile(String fileType) throws IOException { - String extension = ARCHIVE_TYPE_TAR.equals(fileType) ? - String.format(".%s", ARCHIVE_TYPE_TAR) : String.format(".%s", ARCHIVE_TYPE_ZIP); - return Files.createTempFile("temp", extension); - } - /** * Adds a file path and its size to the list of file paths. * If the path represents a directory, appends a "/" to the path. @@ -405,34 +393,6 @@ private void processZipFile(List filePaths, InputStream inputStream) } } - /** - * Closes the specified FileSystem resource if it is not null. - * @param zipFileSystem the FileSystem to close - */ - private void closeFileSystem(FileSystem zipFileSystem) { - if (Objects.nonNull(zipFileSystem)) { - try { - zipFileSystem.close(); - } catch (IOException e) { - log.error("An error occurred while closing the zip file.", e); - } - } - } - - /** - * Deletes the specified temporary file if it is not null. - * @param tempFile the Path object representing the temporary file to delete - */ - private void deleteTempFile(Path tempFile) { - if (Objects.nonNull(tempFile)) { - try { - Files.delete(tempFile); - } catch (IOException e) { - log.error("An error occurred while deleting temp file.", e); - } - } - } - /** * Builds an XML response string based on the provided list of file paths. * @param filePaths the list of file paths to include in the XML response From d2785c8992bb9da36a9660ebee01273d850372ac Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Wed, 9 Apr 2025 10:33:46 +0300 Subject: [PATCH 06/13] removed unused imports --- .../main/java/org/dspace/content/PreviewContentServiceImpl.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index 716222d68cba..9d361014e563 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -14,9 +14,7 @@ import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; -import java.nio.file.FileSystem; import java.nio.file.Files; -import java.nio.file.Path; import java.nio.file.Paths; import java.sql.SQLException; import java.util.ArrayList; From 604e9ac05210774fbfd2d8558fb3710fb9ebbc13 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Thu, 10 Apr 2025 07:38:21 +0300 Subject: [PATCH 07/13] determinated threat pool size, adding conditional parallelism for small archives --- .../content/PreviewContentServiceImpl.java | 86 ++++++++++++------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index 9d361014e563..14c75dc1905c 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -15,6 +15,7 @@ import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.Path; import java.nio.file.Paths; import java.sql.SQLException; import java.util.ArrayList; @@ -77,6 +78,9 @@ public class PreviewContentServiceImpl implements PreviewContentService { @Value("${file.preview.zip.limit.length:1000}") private int maxPreviewCount; + @Value("${file.preview.archive.thread.pool.size:#{null}}") + private Integer archiveThreadPoolSize; + @Autowired PreviewContentDAO previewContentDAO; @@ -299,7 +303,11 @@ private Hashtable createSubMap(Map sourceMap, Funct * @param size the size of the file or directory */ private void addFilePath(List filePaths, String path, long size) { - String fileInfo = (Files.isDirectory(Paths.get(path))) ? path + "/|" + size : path + "|" + size; + Path p = Paths.get(path); + if (!Files.exists(p)) { + throw new IllegalArgumentException("Path does not exist: " + path); + } + String fileInfo = Files.isDirectory(p) ? path + "/|" + size : path + "|" + size; filePaths.add(fileInfo); } @@ -315,8 +323,10 @@ private void addFilePath(List filePaths, String path, long size) { */ private void processTarFile(List filePaths, InputStream inputStream) throws IOException, InterruptedException, ExecutionException { - int numProcessors = Runtime.getRuntime().availableProcessors(); - ExecutorService executorService = Executors.newFixedThreadPool(numProcessors); + List entries = new ArrayList<>(); + int threadPoolSize = (archiveThreadPoolSize != null) ? + archiveThreadPoolSize : Runtime.getRuntime().availableProcessors(); + ExecutorService executorService = Executors.newFixedThreadPool(threadPoolSize); List>> futures = new ArrayList<>(); try (BufferedInputStream bufferedStream = new BufferedInputStream(inputStream); @@ -325,24 +335,30 @@ private void processTarFile(List filePaths, InputStream inputStream) TarArchiveEntry entry; while ((entry = tis.getNextTarEntry()) != null) { if (!entry.isDirectory()) { - String entryName = entry.getName(); - long fileSize = entry.getSize(); - - // Submit a task to process each entry and return a list containing the path - Callable> task = () -> { - List localPaths = new ArrayList<>(); - addFilePath(localPaths, entryName, fileSize); - return localPaths; - }; - futures.add(executorService.submit(task)); + entries.add(entry); } } - - // Collect results from all tasks in a thread-safe manner + // Process sequentially if below threshold + if (entries.size() < archiveThreadPoolSize) { + for (TarArchiveEntry e : entries) { + addFilePath(filePaths, e.getName(), e.getSize()); + } + return; + } + // Process in parallel if above threshold + for (TarArchiveEntry e : entries) { + String entryName = e.getName(); + long fileSize = e.getSize(); + Callable> task = () -> { + List localPaths = new ArrayList<>(); + addFilePath(localPaths, entryName, fileSize); + return localPaths; + }; + futures.add(executorService.submit(task)); + } for (Future> future : futures) { - filePaths.addAll(future.get()); + filePaths.addAll(future.get()); // Thread-safe addition } - } finally { executorService.shutdown(); } @@ -360,8 +376,10 @@ private void processTarFile(List filePaths, InputStream inputStream) */ private void processZipFile(List filePaths, InputStream inputStream) throws IOException, InterruptedException, ExecutionException { - int numProcessors = Runtime.getRuntime().availableProcessors(); - ExecutorService executorService = Executors.newFixedThreadPool(numProcessors); + List entries = new ArrayList<>(); + int threadPoolSize = (archiveThreadPoolSize != null) ? + archiveThreadPoolSize : Runtime.getRuntime().availableProcessors(); + ExecutorService executorService = Executors.newFixedThreadPool(threadPoolSize); List>> futures = new ArrayList<>(); try (BufferedInputStream bufferedStream = new BufferedInputStream(inputStream); @@ -370,22 +388,30 @@ private void processZipFile(List filePaths, InputStream inputStream) ZipEntry entry; while ((entry = zipInputStream.getNextEntry()) != null) { if (!entry.isDirectory()) { - String entryName = entry.getName(); - long fileSize = entry.getSize(); - - Callable> task = () -> { - List localPaths = new ArrayList<>(); - addFilePath(localPaths, entryName, fileSize); - return localPaths; - }; - futures.add(executorService.submit(task)); + entries.add(entry); } } - + // Process sequentially if below threshold + if (entries.size() < archiveThreadPoolSize) { + for (ZipEntry e : entries) { + addFilePath(filePaths, e.getName(), e.getSize()); + } + return; + } + // Process in parallel if above threshold + for (ZipEntry e : entries) { + String entryName = e.getName(); + long fileSize = e.getSize(); + Callable> task = () -> { + List localPaths = new ArrayList<>(); + addFilePath(localPaths, entryName, fileSize); + return localPaths; + }; + futures.add(executorService.submit(task)); + } for (Future> future : futures) { filePaths.addAll(future.get()); // Thread-safe addition } - } finally { executorService.shutdown(); } From 44812122f77b166d0ea882b121c9d9f3f65a3f7a Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Thu, 10 Apr 2025 09:24:31 +0300 Subject: [PATCH 08/13] checkstyleviolations, fix addPathFile --- .../org/dspace/content/PreviewContentServiceImpl.java | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index 14c75dc1905c..5962d5820131 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -303,11 +303,7 @@ private Hashtable createSubMap(Map sourceMap, Funct * @param size the size of the file or directory */ private void addFilePath(List filePaths, String path, long size) { - Path p = Paths.get(path); - if (!Files.exists(p)) { - throw new IllegalArgumentException("Path does not exist: " + path); - } - String fileInfo = Files.isDirectory(p) ? path + "/|" + size : path + "|" + size; + String fileInfo = Files.isDirectory(Paths.get(path)) ? path + "/|" + size : path + "|" + size; filePaths.add(fileInfo); } @@ -377,9 +373,9 @@ private void processTarFile(List filePaths, InputStream inputStream) private void processZipFile(List filePaths, InputStream inputStream) throws IOException, InterruptedException, ExecutionException { List entries = new ArrayList<>(); - int threadPoolSize = (archiveThreadPoolSize != null) ? + int threadPoolSize = (archiveThreadPoolSize != null) ? archiveThreadPoolSize : Runtime.getRuntime().availableProcessors(); - ExecutorService executorService = Executors.newFixedThreadPool(threadPoolSize); + ExecutorService executorService = Executors.newFixedThreadPool(threadPoolSize); List>> futures = new ArrayList<>(); try (BufferedInputStream bufferedStream = new BufferedInputStream(inputStream); From ce529d454cae3fa7dc452ded87982e6f47418db7 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Thu, 10 Apr 2025 09:30:46 +0300 Subject: [PATCH 09/13] removed unused import --- .../main/java/org/dspace/content/PreviewContentServiceImpl.java | 1 - 1 file changed, 1 deletion(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index 5962d5820131..314221e9a483 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -15,7 +15,6 @@ import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.nio.file.Path; import java.nio.file.Paths; import java.sql.SQLException; import java.util.ArrayList; From 9a75cbbd64928fd0b3f797f8a67dffff9d041538 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Mon, 14 Apr 2025 18:28:59 +0300 Subject: [PATCH 10/13] Revert "fix documentation" This reverts commit 7f49fe1c820e47d5695e3fe879716bcf276a3b78. --- .../dspace/content/PreviewContentServiceImpl.java | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index 314221e9a483..81cedeb16b89 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -306,12 +306,18 @@ private void addFilePath(List filePaths, String path, long size) { filePaths.add(fileInfo); } - /** + /* + * Processes a TAR file, extracting its entries and adding their paths and sizes to the provided list. * Utilizes parallelism to process each TAR entry concurrently. * * @param filePaths the list to populate with the extracted file paths and sizes * @param inputStream the InputStream containing the TAR file data + + + + + * @throws IOException if an I/O error occurs while reading the TAR file * @throws InterruptedException if the current thread is interrupted while waiting for the completion of a task * @throws ExecutionException if an exception occurs during the execution of a parallel task @@ -360,6 +366,7 @@ private void processTarFile(List filePaths, InputStream inputStream) } /** + * Processes a ZIP file, extracting its entries and adding their paths and sizes to the provided list. * Utilizes parallelism to process each ZIP entry concurrently. * @@ -368,6 +375,12 @@ private void processTarFile(List filePaths, InputStream inputStream) * @throws IOException if an I/O error occurs while reading the ZIP file * @throws InterruptedException if the current thread is interrupted while waiting for the completion of a task * @throws ExecutionException if an exception occurs during the execution of a parallel task + + + + + + */ private void processZipFile(List filePaths, InputStream inputStream) throws IOException, InterruptedException, ExecutionException { From 39bf4731be3717878ed6b542c4c9d5d871141886 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Mon, 14 Apr 2025 18:39:50 +0300 Subject: [PATCH 11/13] removed parallelism --- .../content/PreviewContentServiceImpl.java | 128 +++--------------- 1 file changed, 18 insertions(+), 110 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index 81cedeb16b89..24ca9e925271 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -7,7 +7,6 @@ */ package org.dspace.content; -import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; @@ -25,11 +24,6 @@ import java.util.Map; import java.util.Objects; import java.util.UUID; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -76,11 +70,7 @@ public class PreviewContentServiceImpl implements PreviewContentService { // Configured ZIP file preview limit (default: 1000) - if the ZIP file contains more files, it will be truncated @Value("${file.preview.zip.limit.length:1000}") private int maxPreviewCount; - - @Value("${file.preview.archive.thread.pool.size:#{null}}") - private Integer archiveThreadPoolSize; - - + @Autowired PreviewContentDAO previewContentDAO; @Autowired(required = true) @@ -306,122 +296,40 @@ private void addFilePath(List filePaths, String path, long size) { filePaths.add(fileInfo); } - /* - - * Processes a TAR file, extracting its entries and adding their paths and sizes to the provided list. - * Utilizes parallelism to process each TAR entry concurrently. - * - * @param filePaths the list to populate with the extracted file paths and sizes - * @param inputStream the InputStream containing the TAR file data - - - - - + /** + * Processes a TAR file, extracting its entries and adding their paths to the provided list. + * @param filePaths the list to populate with the extracted file paths + * @param inputStream the TAR file data * @throws IOException if an I/O error occurs while reading the TAR file - * @throws InterruptedException if the current thread is interrupted while waiting for the completion of a task - * @throws ExecutionException if an exception occurs during the execution of a parallel task */ - private void processTarFile(List filePaths, InputStream inputStream) - throws IOException, InterruptedException, ExecutionException { - List entries = new ArrayList<>(); - int threadPoolSize = (archiveThreadPoolSize != null) ? - archiveThreadPoolSize : Runtime.getRuntime().availableProcessors(); - ExecutorService executorService = Executors.newFixedThreadPool(threadPoolSize); - List>> futures = new ArrayList<>(); - - try (BufferedInputStream bufferedStream = new BufferedInputStream(inputStream); - TarArchiveInputStream tis = new TarArchiveInputStream(bufferedStream)) { - + private void processTarFile(List filePaths, InputStream inputStream) throws IOException { + try (TarArchiveInputStream tis = new TarArchiveInputStream(inputStream)) { TarArchiveEntry entry; while ((entry = tis.getNextTarEntry()) != null) { if (!entry.isDirectory()) { - entries.add(entry); + // Add the file path and its size (from the TAR entry) + addFilePath(filePaths, entry.getName(), entry.getSize()); } } - // Process sequentially if below threshold - if (entries.size() < archiveThreadPoolSize) { - for (TarArchiveEntry e : entries) { - addFilePath(filePaths, e.getName(), e.getSize()); - } - return; - } - // Process in parallel if above threshold - for (TarArchiveEntry e : entries) { - String entryName = e.getName(); - long fileSize = e.getSize(); - Callable> task = () -> { - List localPaths = new ArrayList<>(); - addFilePath(localPaths, entryName, fileSize); - return localPaths; - }; - futures.add(executorService.submit(task)); - } - for (Future> future : futures) { - filePaths.addAll(future.get()); // Thread-safe addition - } - } finally { - executorService.shutdown(); } } /** - - * Processes a ZIP file, extracting its entries and adding their paths and sizes to the provided list. - * Utilizes parallelism to process each ZIP entry concurrently. - * - * @param filePaths the list to populate with the extracted file paths and sizes - * @param inputStream the InputStream containing the ZIP file data - * @throws IOException if an I/O error occurs while reading the ZIP file - * @throws InterruptedException if the current thread is interrupted while waiting for the completion of a task - * @throws ExecutionException if an exception occurs during the execution of a parallel task - - - - - - + * Processes a ZIP file, extracting its entries and adding their paths to the provided list. + * @param filePaths the list to populate with the extracted file paths + * @param inputStream the ZIP file data + * @throws IOException if an I/O error occurs while reading the ZIP file */ - private void processZipFile(List filePaths, InputStream inputStream) - throws IOException, InterruptedException, ExecutionException { - List entries = new ArrayList<>(); - int threadPoolSize = (archiveThreadPoolSize != null) ? - archiveThreadPoolSize : Runtime.getRuntime().availableProcessors(); - ExecutorService executorService = Executors.newFixedThreadPool(threadPoolSize); - List>> futures = new ArrayList<>(); - - try (BufferedInputStream bufferedStream = new BufferedInputStream(inputStream); - ZipInputStream zipInputStream = new ZipInputStream(bufferedStream)) { - + private void processZipFile(List filePaths, InputStream inputStream) throws IOException { + try (ZipInputStream zipInputStream = new ZipInputStream(inputStream)) { ZipEntry entry; while ((entry = zipInputStream.getNextEntry()) != null) { if (!entry.isDirectory()) { - entries.add(entry); + // Add the file path and its size (from the ZIP entry) + long fileSize = entry.getSize(); + addFilePath(filePaths, entry.getName(), fileSize); } } - // Process sequentially if below threshold - if (entries.size() < archiveThreadPoolSize) { - for (ZipEntry e : entries) { - addFilePath(filePaths, e.getName(), e.getSize()); - } - return; - } - // Process in parallel if above threshold - for (ZipEntry e : entries) { - String entryName = e.getName(); - long fileSize = e.getSize(); - Callable> task = () -> { - List localPaths = new ArrayList<>(); - addFilePath(localPaths, entryName, fileSize); - return localPaths; - }; - futures.add(executorService.submit(task)); - } - for (Future> future : futures) { - filePaths.addAll(future.get()); // Thread-safe addition - } - } finally { - executorService.shutdown(); } } From 334a791c1b0c194ec6b589da11841b093a079f6f Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Mon, 14 Apr 2025 18:47:20 +0300 Subject: [PATCH 12/13] added safe way how to addFilePath --- .../dspace/content/PreviewContentServiceImpl.java | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index 24ca9e925271..a05fc8bf954c 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -14,6 +14,8 @@ import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.InvalidPathException; +import java.nio.file.Path; import java.nio.file.Paths; import java.sql.SQLException; import java.util.ArrayList; @@ -70,7 +72,8 @@ public class PreviewContentServiceImpl implements PreviewContentService { // Configured ZIP file preview limit (default: 1000) - if the ZIP file contains more files, it will be truncated @Value("${file.preview.zip.limit.length:1000}") private int maxPreviewCount; - + + @Autowired PreviewContentDAO previewContentDAO; @Autowired(required = true) @@ -292,7 +295,15 @@ private Hashtable createSubMap(Map sourceMap, Funct * @param size the size of the file or directory */ private void addFilePath(List filePaths, String path, long size) { - String fileInfo = Files.isDirectory(Paths.get(path)) ? path + "/|" + size : path + "|" + size; + String fileInfo = ""; + try { + Path filePath = Paths.get(path); + boolean isDir = Files.isDirectory(filePath); + fileInfo = (isDir ? path + "/|" : path + "|") + size; + } catch (NullPointerException | InvalidPathException | SecurityException e) { + // Handle exception appropriately + throw e; + } filePaths.add(fileInfo); } From eced1ed6c07a6e9acc02684932eb49b20bc4f4c4 Mon Sep 17 00:00:00 2001 From: Paurikova2 Date: Tue, 15 Apr 2025 14:06:28 +0300 Subject: [PATCH 13/13] log error when path is not added properly --- .../java/org/dspace/content/PreviewContentServiceImpl.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java index a05fc8bf954c..371d5abf30b7 100644 --- a/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/content/PreviewContentServiceImpl.java @@ -301,8 +301,7 @@ private void addFilePath(List filePaths, String path, long size) { boolean isDir = Files.isDirectory(filePath); fileInfo = (isDir ? path + "/|" : path + "|") + size; } catch (NullPointerException | InvalidPathException | SecurityException e) { - // Handle exception appropriately - throw e; + log.error(String.format("Failed to add file path. Path: '%s', Size: %d", path, size), e); } filePaths.add(fileInfo); }