AWS S3 Select API Java Example

Published
Updated

The following example shows how to use the AWS SDK for Java to run an S3 Select SQL query against each object stored under a key prefix ("directory") in an S3 bucket.

AWS S3 Select Maven Pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the S3 Select example: a single jar module that depends on the AWS SDK for Java S3 and core modules. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Coordinates of this example project. -->
    <groupId>com.codetinkering</groupId>
    <artifactId>s3selectexample</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <!-- Read sources as UTF-8 and compile for Java 8 (source and target level 1.8). -->
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <!-- Import the AWS SDK bill of materials (BOM). The SDK dependencies declared below omit
         <version>, so their versions are taken from this BOM; change the SDK version here only. -->
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>com.amazonaws</groupId>
                <artifactId>aws-java-sdk-bom</artifactId>
                <version>1.12.522</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <dependencies>
        <!-- S3 module of the AWS SDK (version managed by the BOM above). -->
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
        </dependency>
        <!-- Core module of the AWS SDK, declared explicitly (version managed by the BOM above). -->
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-core</artifactId>
        </dependency>
    </dependencies>

</project>

S3 Select Java Example

Change the bucket name, directory name (key prefix), search query, credentials profile name, and region in the following code to match your needs.

import com.amazonaws.AmazonClientException;
import com.amazonaws.auth.profile.ProfileCredentialsProvider;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.*;
import com.amazonaws.util.IOUtils;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

/**
 * Runs an S3 Select SQL query against every object found under a key prefix of a bucket
 * and prints each object's result records to standard output.
 *
 * <p>The bucket name, key prefix, query, credentials profile name and region are hard-coded
 * in {@link #main(String[])}; edit them before running.
 */
public class S3SelectExample {

    /**
     * Entry point: lists the objects under the configured prefix and queries each one.
     *
     * @param args unused
     * @throws AmazonClientException if the named credentials profile cannot be loaded
     */
    public static void main(String[] args) {

        String bucketName = "mybucketname";
        String directoryName = "mydirectoryname/";
        String searchQuery = "SELECT * FROM S3Object s";

        ProfileCredentialsProvider credentialsProvider = new ProfileCredentialsProvider("AWSprofileName");

        // Fail fast with a readable message if the profile cannot be loaded,
        // keeping the original exception as the cause for diagnosis.
        try {
            credentialsProvider.getCredentials();
        } catch (Exception e) {
            throw new AmazonClientException("Failed to load credential profile. Please check that your file exists and is located at (~/.aws/credentials)", e);
        }

        AmazonS3 s3Client = AmazonS3ClientBuilder.standard()
                .withCredentials(credentialsProvider)
                .withRegion("us-east-1")
                .build();

        try {
            // Fetch the list of files in this directory. S3 returns listings in pages
            // (up to 1,000 keys per response), so keep requesting the next batch until
            // the listing is no longer truncated; otherwise later objects are skipped.
            ObjectListing objectListing = s3Client.listObjects(bucketName, directoryName);
            while (true) {
                for (S3ObjectSummary summary : objectListing.getObjectSummaries()) {
                    selectFromObject(s3Client, bucketName, summary.getKey(), searchQuery);
                }
                if (!objectListing.isTruncated()) {
                    break;
                }
                objectListing = s3Client.listNextBatchOfObjects(objectListing);
            }
        } finally {
            // Release the client's resources once all objects have been processed.
            s3Client.shutdown();
        }
    }

    /**
     * Builds an S3 Select request that reads one object as CSV and returns matching records as JSON.
     *
     * @param bucketName  bucket containing the object
     * @param key         key of the object to query
     * @param searchQuery SQL expression to evaluate against the object
     * @return the populated request
     */
    private static SelectObjectContentRequest buildSelectRequest(String bucketName, String key, String searchQuery) {
        SelectObjectContentRequest request = new SelectObjectContentRequest();
        request.setBucketName(bucketName);
        request.setKey(key);
        request.setExpression(searchQuery);
        request.setExpressionType(ExpressionType.SQL);

        // The input file is treated as CSV. For other formats, replace setCsv(...) with
        // setParquet(new ParquetInput()) or setJson(new JSONInput()).
        InputSerialization inputSerialization = new InputSerialization();
        inputSerialization.setCsv(new CSVInput());

        OutputSerialization outputSerialization = new OutputSerialization();
        outputSerialization.setJson(new JSONOutput());

        request.setInputSerialization(inputSerialization);
        request.setOutputSerialization(outputSerialization);
        return request;
    }

    /**
     * Runs the query against a single object and prints its records to standard output.
     * An {@link IOException} while reading or closing the result is reported on standard
     * error and does not stop processing of the remaining objects.
     *
     * @param s3Client    client used to issue the request
     * @param bucketName  bucket containing the object
     * @param key         key of the object to query
     * @param searchQuery SQL expression to evaluate against the object
     */
    private static void selectFromObject(AmazonS3 s3Client, String bucketName, String key, String searchQuery) {
        SelectObjectContentRequest request = buildSelectRequest(bucketName, key, searchQuery);

        try (SelectObjectContentResult result = s3Client.selectObjectContent(request)) {

            System.out.println("Results: " + result);

            // The visitor receives the non-record events (stats, end-of-stream) as the
            // records stream is consumed.
            try (InputStream resultInputStream = result.getPayload().getRecordsInputStream(
                    new SelectObjectContentEventVisitor() {
                        @Override
                        public void visit(SelectObjectContentEvent.StatsEvent event) {
                            System.out.println("Received stats, bytes scanned: " + event.getDetails().getBytesScanned()
                                    + ", bytes processed: " + event.getDetails().getBytesProcessed());
                        }

                        @Override
                        public void visit(SelectObjectContentEvent.EndEvent event) {
                            System.out.println("Result is complete");
                        }
                    }
            )) {
                // Print results to console.
                // TODO: Do something with the results here.
                // Decode explicitly as UTF-8 instead of relying on the platform default charset.
                System.out.println(new String(IOUtils.toByteArray(resultInputStream), StandardCharsets.UTF_8).trim());
            }
        } catch (IOException e) {
            // Best effort: report which object failed and carry on with the next one.
            System.err.println("Failed to read S3 Select results for s3://" + bucketName + "/" + key);
            e.printStackTrace();
        }
    }

}