AWS S3 Select API Java Example

The following example shows how to use the AWS SDK for Java to run an S3 Select SQL query against the contents of objects stored in an S3 bucket.

AWS S3 Select Maven Pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the S3 Select example: a single jar module that depends on the AWS SDK for Java (v1) S3 client. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Coordinates of this example project. -->
    <groupId>com.codetinkering</groupId>
    <artifactId>s3selectexample</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Compile for Java 8 source and bytecode level. -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <!-- Import the AWS SDK bill of materials so that every AWS SDK module below resolves to the same version. -->
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>com.amazonaws</groupId>
                <artifactId>aws-java-sdk-bom</artifactId>
                <version>1.12.74</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <!-- No <version> elements here: versions are supplied by the imported BOM above. -->
    <dependencies>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-core</artifactId>
        </dependency>
    </dependencies>

</project>

S3 Select Java Example

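Change the bucket name, directory name (key prefix), and search query in the code below to match your needs. The AWS credentials profile name and the region are also hard-coded and must be changed to match your environment.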

import com.amazonaws.AmazonClientException;
import com.amazonaws.auth.profile.ProfileCredentialsProvider;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.*;
import com.amazonaws.util.IOUtils;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

/**
 * Runs an S3 Select SQL query against every object stored under a key prefix and prints the
 * JSON-formatted result of each query to standard output.
 *
 * <p>The bucket name, key prefix, query, credentials profile name and region are hard-coded in
 * {@link #main(String[])}; edit them before running.
 */
public class S3SelectExample {

    /**
     * Entry point: lists the objects under the configured prefix (following pagination) and
     * runs the S3 Select query against each one.
     *
     * @param args unused
     * @throws AmazonClientException if the credentials profile cannot be loaded
     */
    public static void main(String[] args) {

        String bucketName = "mybucketname";
        String directoryName = "mydirectoryname/";
        String searchQuery = "SELECT * FROM S3Object s";

        ProfileCredentialsProvider credentialsProvider = new ProfileCredentialsProvider("AWSprofileName");

        // Fail fast, before any S3 call, if the profile cannot be resolved.
        try {
            credentialsProvider.getCredentials();
        } catch (Exception e) {
            // Keep the original exception as the cause so the real reason is not lost.
            throw new AmazonClientException("Failed to load credential profile. Please check that your file exists and is located at (~/.aws/credentials)", e);
        }

        AmazonS3 s3Client = AmazonS3ClientBuilder.standard()
                .withCredentials(credentialsProvider)
                .withRegion("us-east-1")
                .build();

        // A single listObjects call returns only one page of results, so keep fetching
        // the next batch until the listing is no longer truncated.
        ObjectListing objectListing = s3Client.listObjects(bucketName, directoryName);
        while (true) {
            for (S3ObjectSummary summary : objectListing.getObjectSummaries()) {
                String key = summary.getKey();
                // Keys ending in "/" are conventionally folder placeholder objects, not data files.
                if (key.endsWith("/")) {
                    continue;
                }
                printSelectResults(s3Client, buildSelectRequest(bucketName, key, searchQuery), key);
            }
            if (!objectListing.isTruncated()) {
                break;
            }
            objectListing = s3Client.listNextBatchOfObjects(objectListing);
        }
    }

    /** Builds an S3 Select request that reads one CSV object and returns the rows as JSON. */
    private static SelectObjectContentRequest buildSelectRequest(String bucketName, String key, String searchQuery) {
        SelectObjectContentRequest request = new SelectObjectContentRequest();
        request.setBucketName(bucketName);
        request.setKey(key);
        request.setExpression(searchQuery);
        request.setExpressionType(ExpressionType.SQL);

        InputSerialization inputSerialization = new InputSerialization();
        //inputSerialization.setParquet(new ParquetInput()); // Uncomment this for Parquet format input file
        //inputSerialization.setJson(new JSONInput()); // Uncomment this for JSON format input file
        inputSerialization.setCsv(new CSVInput()); // Using CSV input format on the file

        OutputSerialization outputSerialization = new OutputSerialization();
        outputSerialization.setJson(new JSONOutput());

        request.setInputSerialization(inputSerialization);
        request.setOutputSerialization(outputSerialization);
        return request;
    }

    /**
     * Executes one S3 Select request and prints its records, plus stats/end events, to stdout.
     * An {@link IOException} while reading the result stream is reported on stderr and does not
     * stop processing of the remaining objects.
     */
    private static void printSelectResults(AmazonS3 s3Client, SelectObjectContentRequest request, String key) {
        try (SelectObjectContentResult result = s3Client.selectObjectContent(request)) {

            System.out.println("Results: " + result);

            // The visitor is called back for non-record events while the record stream is consumed.
            try (InputStream resultInputStream = result.getPayload().getRecordsInputStream(
                    new SelectObjectContentEventVisitor() {
                        @Override
                        public void visit(SelectObjectContentEvent.StatsEvent event) {
                            System.out.println("Received stats, bytes scanned: " + event.getDetails().getBytesScanned()
                                    + ", bytes processed: " + event.getDetails().getBytesProcessed());
                        }

                        @Override
                        public void visit(SelectObjectContentEvent.EndEvent event) {
                            System.out.println("Result is complete");
                        }
                    }
            )) {
                //Print results to console
                // TODO: Do something with the results here.
                // Decode explicitly as UTF-8 rather than relying on the platform default charset.
                System.out.println(new String(IOUtils.toByteArray(resultInputStream), StandardCharsets.UTF_8).trim());
            }
        } catch (IOException e) {
            System.err.println("Failed to read S3 Select results for key: " + key);
            e.printStackTrace();
        }
    }

}