Deduper.java

/*
 * Syncany, www.syncany.org
 * Copyright (C) 2011-2016 Philipp C. Heckel <philipp.heckel@gmail.com> 
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.syncany.chunk;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;

import org.syncany.chunk.Chunker.ChunkEnumeration;
import org.syncany.database.MultiChunkEntry.MultiChunkId;

/**
 * The Deduper implements the core deduplication algorithm used by Syncany. 
 * 
 * <p>The algorithm uses a {@link Chunker} to break files into individual
 * {@link Chunk}s. These chunks are added to a {@link MultiChunk} using an implementation
 * of a {@link MultiChunker}. Before this multichunk is written to a file, it is transformed
 * using one or many {@link Transformer}s (can be chained). 
 * 
 * <p>This class does not maintain a chunk index itself. Instead, it calls a listener to
 * lookup a chunk, and skips further chunk processing if the chunk already exists. 
 * 
 * <p>For a detailed description of the algorithm, please refer to chapter 5.3 of the thesis:
 * <i>"Minimizing remote storage usage and synchronization time using deduplication and
 * multichunking: Syncany as an example"</i>
 * 
 * @see <a href="http://blog.philippheckel.com/2013/05/20/minimizing-remote-storage-usage-and-synchronization-time-using-deduplication-and-multichunking-syncany-as-an-example/">Blog post: Minimizing remote storage usage and synchronization time using deduplication and multichunking: Syncany as an example</a>
 * @author Philipp C. Heckel (philipp.heckel@gmail.com)
 */
public class Deduper {	
	private Chunker chunker;
	private MultiChunker multiChunker;
	private Transformer transformer;
	private long maxTotalSize;
	private long maxNumberOfFiles;

	public Deduper(Chunker chunker, MultiChunker multiChunker, Transformer transformer, long maxTotalSize, long maxNumberOfFiles) {
		this.chunker = chunker;
		this.multiChunker = multiChunker;
		this.transformer = transformer;
		this.maxTotalSize = maxTotalSize;
		this.maxNumberOfFiles = maxNumberOfFiles;
	}
	
	/**
	 * Deduplicates the given list of files according to the Syncany chunk algorithm. 
	 * 
	 * <p>A brief description of the algorithm (and further links to a detailed description)
	 * are given in the {@link Deduper}.
	 *  	
	 * @param files List of files to be deduplicated (will be modified!)
	 * @param listener Listener to react of file/chunk/multichunk events, and to implement the chunk index
	 * @throws IOException If a file cannot be read or an unexpected exception occurs
	 */
	public void deduplicate(List<File> files, DeduperListener listener) throws IOException {
		Chunk chunk = null;
		MultiChunk multiChunk = null;
		long totalMultiChunkSize = 0L;
		long totalNumFiles = 0L;
		
		while (!files.isEmpty()) {
			File file = files.remove(0);
			totalNumFiles++;
			
			// Filter ignored files
			boolean fileAccepted = listener.onFileFilter(file);
			
			if (!fileAccepted) {
				continue;
			}
			
			// Decide whether to index the contents
			boolean dedupContents = listener.onFileStart(file);

			if (dedupContents) {
				// Create chunks from file
				ChunkEnumeration chunksEnum = chunker.createChunks(file);

				while (chunksEnum.hasMoreElements()) {
					chunk = chunksEnum.nextElement();

					// old chunk
					if (!listener.onChunk(chunk)) {
						listener.onFileAddChunk(file, chunk);
						continue;
					}

					// new chunk
					else {					
						// - Check if multichunk full
						if (multiChunk != null && multiChunk.isFull()) {
							totalMultiChunkSize += multiChunk.getSize();
							multiChunk.close();
							listener.onMultiChunkClose(multiChunk);

							multiChunk = null;
						}

						// - Open new multichunk if non-existent
						if (multiChunk == null) {
							MultiChunkId newMultiChunkId = listener.createNewMultiChunkId(chunk);
							File multiChunkFile = listener.getMultiChunkFile(newMultiChunkId);
							
							multiChunk = multiChunker.createMultiChunk(newMultiChunkId, 
								transformer.createOutputStream(new FileOutputStream(multiChunkFile)));

							listener.onMultiChunkOpen(multiChunk);
						}

						// - Add chunk data
						multiChunk.write(chunk);						
						listener.onMultiChunkWrite(multiChunk, chunk);						
					}

					listener.onFileAddChunk(file, chunk);										
				}

				// Closing file is necessary!
				chunksEnum.close();

			}

			if (chunk != null) {			
				listener.onFileEnd(file, chunk.getFileChecksum());
			}
			else {
				listener.onFileEnd(file, null);
			}
			
			// Reset chunk (if folder after chunk, the folder would have a checksum b/c of chunk.getFileChecksum())
			chunk = null;

			// Check if we have reached the transaction limit
			if (multiChunk != null) {
				if (totalMultiChunkSize + multiChunk.getSize() >= maxTotalSize || totalNumFiles >= maxNumberOfFiles) {
					multiChunk.close();
					listener.onMultiChunkClose(multiChunk);
					return;
				}
			}
			else if (totalMultiChunkSize >= maxTotalSize || totalNumFiles >= maxNumberOfFiles) {
				return;
			}
		}

		// Close and add last multichunk
		if (multiChunk != null) {
			// Data
			multiChunk.close();
			listener.onMultiChunkClose(multiChunk);

			multiChunk = null;
		}
		
		listener.onFinish();

		return;
	}	
}