diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..9aa782b59aeff463e1d8036ffad8b769c2c847be
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,51 @@
+# Coordinates of the hardened base image. hardening_manifest.yaml passes BASE_IMAGE
+# and BASE_TAG as build args; these defaults apply when no build arg is given.
+# NOTE(review): default changed from repo1.dso.mil, which is presumably the source-code host rather than the image registry ("registry1" in the manifest) - confirm.
+ARG BASE_REGISTRY=registry1.dso.mil
+ARG BASE_IMAGE=ironbank/opensource/python/python38
+ARG BASE_TAG=3.8
+
+# Upstream Spark image; used only as a source for the COPY --from instructions below.
+FROM spark-operator/spark-py:2.4.4 AS base
+
+FROM ${BASE_REGISTRY}/${BASE_IMAGE}:${BASE_TAG}
+
+# Numeric UID that owns /opt/spark and runs the main process.
+ARG spark_uid=185
+
+USER root
+
+# Stage the pinned wheels in a scratch directory so they can be deleted after install
+# instead of being left behind in the image's working directory.
+COPY pip-21.0.1-py3-none-any.whl setuptools-53.0.0-py3-none-any.whl /tmp/wheels/
+
+# A single upgrade pass is enough (dnf treats "update" as an alias of "upgrade").
+RUN dnf -y upgrade && \
+    pip3 install --no-cache-dir --upgrade \
+        /tmp/wheels/pip-21.0.1-py3-none-any.whl \
+        /tmp/wheels/setuptools-53.0.0-py3-none-any.whl && \
+    dnf clean all && \
+    rm -rf /var/cache/dnf /tmp/wheels && \
+    mkdir -p /opt/spark/python/pyspark /opt/spark/python/lib && \
+    chown -R "${spark_uid}:${spark_uid}" /opt/spark/
+
+# NOTE(review): this copy was already disabled; the empty pyspark directory is still created above - confirm lib/ alone is sufficient.
+#COPY --from=base /opt/spark/python/pyspark /opt/spark/python/pyspark
+# --chown keeps the copied files owned by spark_uid; a plain COPY would add them as root
+# after the chown above has already run.
+COPY --from=base --chown=${spark_uid}:${spark_uid} /opt/spark/python/lib /opt/spark/python/lib
+
+# The ENTRYPOINT below needs this script in the final stage; no other instruction in this file adds it.
+# NOTE(review): assumes the upstream image ships /opt/entrypoint.sh and that whatever it
+# executes is available in this image - confirm.
+COPY --from=base /opt/entrypoint.sh /opt/entrypoint.sh
+
+ENV SPARK_HOME=/opt/spark
+
+WORKDIR /opt/spark/work-dir
+RUN chmod g+w /opt/spark/work-dir
+
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
+
+# Specify the User that the actual main process will run as
+USER ${spark_uid}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 5dc6fa6db4361c22da2f35edf0544d83ba6001e2..aa7d1dd338be04d9b69b068e3e2d7330ce3b1da6 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,108 @@ -# +# Apache Spark -Project template for all Iron Bank container repositories. \ No newline at end of file +Spark is a unified analytics engine for large-scale data processing. It provides +high-level APIs in Scala, Java, Python, and R, and an optimized engine that +supports general computation graphs for data analysis. It also supports a +rich set of higher-level tools including Spark SQL for SQL and DataFrames, +MLlib for machine learning, GraphX for graph processing, +and Structured Streaming for stream processing. 
+ + + +[![Jenkins Build](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-3.2/badge/icon)](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-3.2) +[![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark) +[![PySpark Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site) + + +## Online Documentation + +You can find the latest Spark documentation, including a programming +guide, on the [project web page](https://spark.apache.org/documentation.html). +This README file only contains basic setup instructions. + +## Building Spark + +Spark is built using [Apache Maven](https://maven.apache.org/). +To build Spark and its example programs, run: + + ./build/mvn -DskipTests clean package + +(You do not need to do this if you downloaded a pre-built package.) + +More detailed documentation is available from the project site, at +["Building Spark"](https://spark.apache.org/docs/latest/building-spark.html). + +For general development tips, including info on developing Spark using an IDE, see ["Useful Developer Tools"](https://spark.apache.org/developer-tools.html). 
+ +## Interactive Scala Shell + +The easiest way to start using Spark is through the Scala shell: + + ./bin/spark-shell + +Try the following command, which should return 1,000,000,000: + + scala> spark.range(1000 * 1000 * 1000).count() + +## Interactive Python Shell + +Alternatively, if you prefer Python, you can use the Python shell: + + ./bin/pyspark + +And run the following command, which should also return 1,000,000,000: + + >>> spark.range(1000 * 1000 * 1000).count() + +## Example Programs + +Spark also comes with several sample programs in the `examples` directory. +To run one of them, use `./bin/run-example <class> [params]`. For example: + + ./bin/run-example SparkPi + +will run the Pi example locally. + +You can set the MASTER environment variable when running examples to submit +examples to a cluster. This can be a mesos:// or spark:// URL, +"yarn" to run on YARN, and "local" to run +locally with one thread, or "local[N]" to run locally with N threads. You +can also use an abbreviated class name if the class is in the `examples` +package. For instance: + + MASTER=spark://host:7077 ./bin/run-example SparkPi + +Many of the example programs print usage help if no params are given. + +## Running Tests + +Testing first requires [building Spark](#building-spark). Once Spark is built, tests +can be run using: + + ./dev/run-tests + +Please see the guidance on how to +[run tests for a module, or individual tests](https://spark.apache.org/developer-tools.html#individual-tests). + +There is also a Kubernetes integration test, see resource-managers/kubernetes/integration-tests/README.md + +## A Note About Hadoop Versions + +Spark uses the Hadoop core library to talk to HDFS and other Hadoop-supported +storage systems. Because the protocols have changed in different versions of +Hadoop, you must build Spark against the same version that your cluster runs. 
+ +Please refer to the build documentation at +["Specifying the Hadoop Version and Enabling YARN"](https://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version-and-enabling-yarn) +for detailed guidance on building for a particular distribution of Hadoop, including +building for particular Hive and Hive Thriftserver distributions. + +## Configuration + +Please refer to the [Configuration Guide](https://spark.apache.org/docs/latest/configuration.html) +in the online documentation for an overview on how to configure Spark. + +## Contributing + +Please review the [Contribution to Spark guide](https://spark.apache.org/contributing.html) +for information on how to get started contributing to the project. diff --git a/hardening_manifest.yaml b/hardening_manifest.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c420ac774290518aeb91eb7e2d2e6d7ae5784dd --- /dev/null +++ b/hardening_manifest.yaml @@ -0,0 +1,60 @@ +--- +apiVersion: v1 + +# The repository name in registry1, excluding /ironbank/ +name: "opensource/spark-operator/spark-py" + +# List of tags to push for the repository in registry1 +# The most specific version should be the first tag and will be shown +# on ironbank.dsop.io +tags: +- "2.4.4" +- "latest" + +# Build args passed to Dockerfile ARGs +args: + BASE_IMAGE: "opensource/python/python38" + BASE_TAG: "3.8" + +# Docker image labels +labels: + org.opencontainers.image.title: "Python Spark" + # Human-readable description of the software packaged in the image + org.opencontainers.image.description: "Apache Spark is a unified analytics engine for large-scale data processing." 
+ # License(s) under which contained software is distributed + org.opencontainers.image.licenses: "Apache-2.0" + # URL to find more information on the image + org.opencontainers.image.url: "https://spark.apache.org/docs/latest/index.html" + # Name of the distributing entity, organization or individual + org.opencontainers.image.vendor: "Apache Software Foundation" + org.opencontainers.image.version: "2.4.4" + # Keywords to help with search (ex. "cicd,gitops,golang") + mil.dso.ironbank.image.keywords: "apache,python,data processing,large scale,hadoop,kubeflow" + # This value can be "opensource" or "commercial" + mil.dso.ironbank.image.type: "opensource" + # Product the image belongs to for grouping multiple images + mil.dso.ironbank.product.name: "spark-operator" + +# List of resources to make available to the offline build context +resources: + - url: "docker://gcr.io/spark-operator/spark-py@sha256:f4980fb33077ae6e03329cf6f835ec671c1b99de391ef9494e28ed19cf3de298" + tag: "spark-operator/spark-py:2.4.4" + - filename: pip-21.0.1-py3-none-any.whl + url: https://files.pythonhosted.org/packages/fe/ef/60d7ba03b5c442309ef42e7d69959f73aacccd0d86008362a681c4698e83/pip-21.0.1-py3-none-any.whl + validation: + type: sha256 + value: 37fd50e056e2aed635dec96594606f0286640489b0db0ce7607f7e51890372d5 + - filename: setuptools-53.0.0-py3-none-any.whl + url: https://files.pythonhosted.org/packages/15/0e/255e3d57965f318973e417d5b7034223f1223de500d91b945ddfaef42a37/setuptools-53.0.0-py3-none-any.whl + validation: + type: sha256 + value: 0e86620d658c5ca87a71a283bd308fcaeb4c33e17792ef6f081aec17c171347f + +# List of project maintainers +maintainers: +- email: "jweatherford@oteemo.com" + # The name of the current container owner + name: "Jeff Weatherford" + # The gitlab username of the current container owner + username: "jweatherford" + cht_member: true