Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
e633052
feat: add support for using python-udf
jesrypandawa Apr 8, 2022
aacfd47
fix: fix sample python config properties
jesrypandawa Apr 8, 2022
165c283
feat: add config mapper
jesrypandawa Apr 12, 2022
e9c66a3
fix: register the python config one by one without map
jesrypandawa Apr 19, 2022
d6fed9d
fix: fix sample config
jesrypandawa Apr 19, 2022
339f435
chore: bump up version to 0.2.7
jesrypandawa Apr 19, 2022
c4477fe
fix: adding exception class and fix test
jesrypandawa Apr 19, 2022
37c9f46
fix: change primitive type for python udf config
jesrypandawa Apr 20, 2022
85e978f
fix: fix python file test
jesrypandawa Apr 24, 2022
6661d74
fix: fix python exception
jesrypandawa Apr 25, 2022
fff30de
fix: fix python test to read zip while excluding text files
jesrypandawa Apr 27, 2022
0a168ca
fix: move register python config inside if statement
jesrypandawa Apr 27, 2022
5631780
fix: enable reading files from GCS when running locally
jesrypandawa Apr 28, 2022
5e4838b
Merge branch 'feat/add-support-python-udf' into python-udf
jesrypandawa Apr 28, 2022
f450bac
fix: create abstraction for python file types and sources
jesrypandawa May 23, 2022
e3ca695
chore: remove whitespace
jesrypandawa May 24, 2022
245177a
refactor: refactor python factory class
jesrypandawa May 25, 2022
5fec7db
fix: add exception on interface method and add more test
jesrypandawa May 30, 2022
c995a58
test: add gcs client test
jesrypandawa May 30, 2022
1c15744
feat: add python workflow (#155)
jesrypandawa May 30, 2022
f7f6ae3
Revert "feat: add python workflow (#155)" (#159)
MayurGubrele May 30, 2022
dd800b6
chore: replace test files and refactor test
jesrypandawa May 31, 2022
9571e5e
Merge branch 'feat/add-support-python-udf' into python-udf
jesrypandawa Jun 2, 2022
f87c793
fix: refactor python files and python_release
jesrypandawa Jun 2, 2022
aa18b52
refactor: move hadoop libraries to common
prakharmathur82 Jun 8, 2022
2e1f15e
chore: refactor python files structure and sample udf
jesrypandawa Jun 8, 2022
7404ae4
refactor: refactor zip structure
jesrypandawa Jun 9, 2022
f239da9
fix: fix data zip structure
jesrypandawa Jun 9, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/python_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@ jobs:
- uses: actions/checkout@v2
- name: Zip Python Udf
run: |
zip -r python_udfs.zip dagger-py-functions/udfs -x "*/__init__.py"
zip -r dagger-py-functions.zip python_functions/requirements.txt python_functions/data python_udfs.zip
cd dagger-py-functions
zip -r python_udfs.zip udfs -x "*/__init__.py"
zip -jr data.zip data
zip -r dagger-py-functions.zip requirements.txt data.zip python_udfs.zip
- name: Upload Release
uses: ncipollo/release-action@v1
with:
Expand Down
4 changes: 4 additions & 0 deletions dagger-common/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ dependencies {
compileOnly group: 'org.apache.flink', name: 'flink-table-api-java-bridge_2.11', version: flinkVersion
compileOnly group: 'org.apache.flink', name: 'flink-connector-kafka_2.11', version: flinkVersion

dependenciesCommonJar ('org.apache.hadoop:hadoop-client:2.8.3') {
exclude module:"commons-cli"
}
dependenciesCommonJar 'com.google.cloud.bigdataoss:gcs-connector:1.9.0-hadoop2'
dependenciesCommonJar 'org.apache.flink:flink-metrics-dropwizard:' + flinkVersion
dependenciesCommonJar 'org.apache.flink:flink-json:' + flinkVersion
dependenciesCommonJar 'com.jayway.jsonpath:json-path:2.4.0'
Expand Down
2 changes: 1 addition & 1 deletion dagger-functions/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ dependencies {
dependenciesFunctionsJar group: 'org.apache.commons', name: 'commons-jexl3', version: '3.1'
dependenciesFunctionsJar group: 'org.isuper', name: 's2-geometry-library-java', version: '0.0.1'
dependenciesFunctionsJar group: 'com.google.cloud', name: 'google-cloud-storage', version: '1.67.0'

testImplementation project(':dagger-common').sourceSets.test.output
testImplementation group: 'junit', name: 'junit', version: '4.12'
testImplementation 'org.mockito:mockito-core:2.0.99-beta'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package io.odpf.dagger.functions.exceptions;

/**
 * Thrown when the python files configuration is missing, so no python UDFs
 * can be registered.
 */
public class PythonFilesEmptyException extends RuntimeException {

    /**
     * Instantiates a new Python files empty exception.
     *
     * @param message description of the missing configuration
     */
    public PythonFilesEmptyException(String message) {
        super(message);
    }
}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,23 @@

import static io.odpf.dagger.functions.common.Constants.*;

/**
* The type Python udf config.
*/
public class PythonUdfConfig {
private static final Gson GSON = new GsonBuilder()
.enableComplexMapKeySerialization()
.setPrettyPrinting()
.create();

@SerializedName(PYTHON_FILES_KEY)
@Getter
private String pythonFiles;

@SerializedName(PYTHON_REQUIREMENTS_KEY)
@Getter
private String pythonRequirements;

@SerializedName(PYTHON_ARCHIVES_KEY)
@Getter
private String pythonArchives;

@SerializedName(PYTHON_FN_EXECUTION_ARROW_BATCH_SIZE_KEY)
Expand All @@ -35,27 +36,72 @@ public class PythonUdfConfig {
@SerializedName(PYTHON_FN_EXECUTION_BUNDLE_TIME_KEY)
private Long pythonBundleTime;

/**
 * Gets the configured python files with all whitespace stripped out.
 *
 * @return the comma separated python file paths, or null when not configured
 */
public String getPythonFiles() {
    return pythonFiles == null ? null : pythonFiles.replaceAll("\\s+", "");
}

/**
 * Gets the configured python archives with all whitespace stripped out.
 *
 * @return the comma separated python archive paths, or null when not configured
 */
public String getPythonArchives() {
    return pythonArchives == null ? null : pythonArchives.replaceAll("\\s+", "");
}

/**
 * Gets the python arrow batch size, falling back to the default when unset.
 *
 * @return the configured arrow batch size, or the default value
 */
public int getPythonArrowBatchSize() {
    return pythonArrowBatchSize == null ? PYTHON_FN_EXECUTION_ARROW_BATCH_SIZE_DEFAULT : pythonArrowBatchSize;
}

/**
 * Gets the python execution bundle size, falling back to the default when unset.
 *
 * @return the configured bundle size, or the default value
 */
public int getPythonBundleSize() {
    return pythonBundleSize == null ? PYTHON_FN_EXECUTION_BUNDLE_SIZE_DEFAULT : pythonBundleSize;
}

/**
 * Gets the python execution bundle time, falling back to the default when unset.
 *
 * @return the configured bundle time, or the default value
 */
public long getPythonBundleTime() {
    return pythonBundleTime == null ? PYTHON_FN_EXECUTION_BUNDLE_TIME_DEFAULT : pythonBundleTime;
}

/**
* Parse python udf config.
*
* @param configuration the configuration
* @return the python udf config
*/
public static PythonUdfConfig parse(Configuration configuration) {
String jsonString = configuration.getString(PYTHON_UDF_CONFIG, "");

Expand Down
Original file line number Diff line number Diff line change
@@ -1,55 +1,51 @@
package io.odpf.dagger.functions.udfs.python;

import io.odpf.dagger.functions.exceptions.PythonFilesFormatException;
import io.odpf.dagger.functions.exceptions.PythonFilesNotFoundException;
import io.odpf.dagger.functions.exceptions.PythonFilesEmptyException;
import io.odpf.dagger.functions.udfs.python.file.type.FileType;
import io.odpf.dagger.functions.udfs.python.file.type.FileTypeFactory;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

import java.io.IOException;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.ArrayList;
import java.util.List;

/**
* The type Python udf manager.
*/
public class PythonUdfManager {

private StreamTableEnvironment tableEnvironment;
private PythonUdfConfig pythonUdfConfig;

/**
 * Instantiates a new Python udf manager that registers python UDFs
 * and python runtime configuration on the given table environment.
 *
 * @param tableEnvironment the table environment to register UDFs on
 * @param pythonUdfConfig  the python udf configuration
 */
public PythonUdfManager(StreamTableEnvironment tableEnvironment, PythonUdfConfig pythonUdfConfig) {
    this.pythonUdfConfig = pythonUdfConfig;
    this.tableEnvironment = tableEnvironment;
}

/**
* Register python functions.
*/
public void registerPythonFunctions() throws IOException {

String inputFiles = pythonUdfConfig.getPythonFiles();
String[] pythonFilesSource;
String[] pythonFiles;
if (inputFiles != null) {
registerPythonConfig();
pythonFilesSource = inputFiles.split(",");
pythonFiles = inputFiles.split(",");
} else {
throw new PythonFilesNotFoundException("Python files not found");
throw new PythonFilesEmptyException("Python files can not be null");
}

for (String pythonFile : pythonFilesSource) {
if (pythonFile.contains(".zip")) {
ZipFile zf = new ZipFile(pythonFile);
for (Enumeration e = zf.entries(); e.hasMoreElements();) {
ZipEntry entry = (ZipEntry) e.nextElement();
String name = entry.getName();
if (name.endsWith(".py")) {
name = name.replace(".py", "").replace("/", ".");
String udfName = name.substring(name.lastIndexOf(".") + 1);
String query = "CREATE TEMPORARY FUNCTION " + udfName.toUpperCase() + " AS '" + name + "." + udfName + "' LANGUAGE PYTHON";
tableEnvironment.executeSql(query);
}
}
} else if (pythonFile.contains(".py")) {
String name = pythonFile.substring(pythonFile.lastIndexOf('/') + 1).replace(".py", "");
String query = "CREATE TEMPORARY FUNCTION " + name.toUpperCase() + " AS '" + name + "." + name + "' LANGUAGE PYTHON";
tableEnvironment.executeSql(query);
} else {
throw new PythonFilesFormatException("Python files should be in .py or .zip format");
}
for (String pythonFile : pythonFiles) {
FileType fileType = FileTypeFactory.getFileType(pythonFile);
List<String> fileNames = fileType.getFileNames();
List<String> sqlQueries = createQuery(fileNames);
executeSql(sqlQueries);
}
}

Expand All @@ -65,4 +61,21 @@ private void registerPythonConfig() {
tableEnvironment.getConfig().getConfiguration().setInteger("python.fn-execution.bundle.size", pythonUdfConfig.getPythonBundleSize());
tableEnvironment.getConfig().getConfiguration().setLong("python.fn-execution.bundle.time", pythonUdfConfig.getPythonBundleTime());
}

// Runs each DDL statement against the table environment.
private void executeSql(List<String> sqlQueries) {
    sqlQueries.forEach(tableEnvironment::executeSql);
}

// Builds one CREATE TEMPORARY FUNCTION statement per python file: the file path is
// turned into a dotted module path, and the UDF name is the last path segment.
private List<String> createQuery(List<String> fileNames) {
    List<String> queries = new ArrayList<>();
    for (String name : fileNames) {
        String modulePath = name.replace(".py", "").replace("/", ".");
        String functionName = modulePath.substring(modulePath.lastIndexOf(".") + 1);
        queries.add("CREATE TEMPORARY FUNCTION " + functionName.toUpperCase()
                + " AS '" + modulePath + "." + functionName + "' LANGUAGE PYTHON");
    }
    return queries;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package io.odpf.dagger.functions.udfs.python.file.source;

import java.io.IOException;

/**
 * Abstraction over a location that python UDF files can be read from
 * (implementations exist for e.g. local filesystem and GCS).
 */
public interface FileSource {

    /**
     * Reads the backing file and returns its raw contents.
     *
     * @return the file contents as a byte array
     * @throws IOException if the file cannot be read
     */
    byte[] getObjectFile() throws IOException;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package io.odpf.dagger.functions.udfs.python.file.source;

import io.odpf.dagger.functions.udfs.python.file.source.gcs.GcsFileSource;
import io.odpf.dagger.functions.udfs.python.file.source.local.LocalFileSource;

/**
 * Factory that resolves the {@link FileSource} implementation for a python
 * file path based on its URI scheme.
 */
public class FileSourceFactory {

    /**
     * Gets the file source for the given python file path.
     *
     * @param pythonFile the python file path (e.g. "gs://bucket/udf.py" or a local path)
     * @return a GCS-backed source for "gs://" paths, otherwise a local file source
     */
    public static FileSource getFileSource(String pythonFile) {
        return isGcsPath(pythonFile) ? new GcsFileSource(pythonFile) : new LocalFileSource(pythonFile);
    }

    // A path is GCS-backed when the text before "://" is "gs", case-insensitively.
    private static boolean isGcsPath(String pythonFile) {
        String scheme = pythonFile.split("://")[0];
        return "GS".equals(scheme.toUpperCase());
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package io.odpf.dagger.functions.udfs.python.file.source.gcs;

import com.google.cloud.storage.Blob;
import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Thin wrapper around the Google Cloud Storage client used to download
 * python UDF files from GCS.
 */
public class GcsClient {

    private Storage storage;

    /**
     * Instantiates a new Gcs client using the default credentials chain.
     */
    public GcsClient() {
        // The field is always null when this constructor runs, so the previous
        // `if (storage == null)` guard was dead code; build the client unconditionally.
        storage = StorageOptions.newBuilder()
                .build().getService();
    }

    /**
     * Instantiates a new Gcs client.
     * This constructor used for unit test purposes.
     *
     * @param storage the storage
     */
    public GcsClient(Storage storage) {
        this.storage = storage;
    }

    /**
     * Downloads the given GCS object and returns its raw contents.
     *
     * @param pythonFile fully qualified GCS path, e.g. "gs://bucket/path/to/file.py"
     * @return the file contents as a byte array
     * @throws IllegalArgumentException if the object does not exist in GCS
     */
    public byte[] getFile(String pythonFile) {
        List<String> file = Arrays.asList(pythonFile.replace("gs://", "").split("/"));

        // First path segment is the bucket; the rest (re-joined with "/") is the object name.
        String bucketName = file.get(0);
        String objectName = file.stream().skip(1).collect(Collectors.joining("/"));

        Blob blob = storage.get(BlobId.of(bucketName, objectName));
        if (blob == null) {
            // Storage.get returns null for a nonexistent object; fail with a clear
            // message instead of an opaque NullPointerException downstream.
            throw new IllegalArgumentException("GCS object not found: " + pythonFile);
        }
        return blob.getContent();
    }
}
Loading