uast.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504
  1. package hercules
  2. import (
  3. "bytes"
  4. "context"
  5. "errors"
  6. "fmt"
  7. "io"
  8. goioutil "io/ioutil"
  9. "os"
  10. "path"
  11. "runtime"
  12. "strings"
  13. "sync"
  14. "time"
  15. "github.com/gogo/protobuf/proto"
  16. "github.com/jeffail/tunny"
  17. "gopkg.in/bblfsh/client-go.v2"
  18. "gopkg.in/bblfsh/sdk.v1/protocol"
  19. "gopkg.in/bblfsh/sdk.v1/uast"
  20. "gopkg.in/src-d/enry.v1"
  21. "gopkg.in/src-d/go-git.v4"
  22. "gopkg.in/src-d/go-git.v4/plumbing"
  23. "gopkg.in/src-d/go-git.v4/plumbing/object"
  24. "gopkg.in/src-d/go-git.v4/utils/ioutil"
  25. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  26. "gopkg.in/src-d/hercules.v3/pb"
  27. )
// UASTExtractor requests UASTs from a Babelfish server for the files touched
// by each set of changes it consumes.
type UASTExtractor struct {
	// Endpoint is the address handed to bblfsh.NewClient.
	Endpoint string
	// Context builds the context (and optional cancel function) used for
	// every parse request.
	Context func() (context.Context, context.CancelFunc)
	// PoolSize is the number of clients and workers; 0 means runtime.NumCPU().
	PoolSize int
	// Languages is the set of language names for which UASTs are extracted.
	Languages map[string]bool
	// FailOnErrors makes Consume return extraction errors instead of
	// printing them to stderr.
	FailOnErrors bool
	// ProcessedFiles maps a file name to the number of times it was
	// submitted, or to UAST_EXTRACTION_SKIPPED when its language is ignored.
	ProcessedFiles map[string]int

	clients []*bblfsh.Client
	pool    *tunny.WorkPool
}
const (
	// UAST_EXTRACTION_SKIPPED is stored in UASTExtractor.ProcessedFiles for
	// files whose detected language is not in UASTExtractor.Languages.
	UAST_EXTRACTION_SKIPPED = -(1 << 31)

	// ConfigUASTEndpoint is the facts key which sets UASTExtractor.Endpoint.
	ConfigUASTEndpoint = "ConfigUASTEndpoint"
	// ConfigUASTTimeout is the facts key with the request timeout in seconds.
	ConfigUASTTimeout = "ConfigUASTTimeout"
	// ConfigUASTPoolSize is the facts key which sets UASTExtractor.PoolSize.
	ConfigUASTPoolSize = "ConfigUASTPoolSize"
	// ConfigUASTFailOnErrors is the facts key which sets UASTExtractor.FailOnErrors.
	ConfigUASTFailOnErrors = "ConfigUASTFailOnErrors"
	// ConfigUASTLanguages is the facts key with the comma-separated language list.
	ConfigUASTLanguages = "ConfigUASTLanguages"
)
// uastTask is the unit of work sent to the worker pool: parse File and store
// the result in Dest.
type uastTask struct {
	// Client is filled in by worker.TunnyJob with the worker's own client.
	Client *bblfsh.Client
	// Lock guards Dest and Errors, which are shared between tasks.
	Lock *sync.RWMutex
	// Dest receives the extracted UAST keyed by the file's hash.
	Dest map[plumbing.Hash]*uast.Node
	// File is the file to parse.
	File *object.File
	// Errors collects extraction failures.
	Errors *[]error
	// Status receives one value when the task has finished.
	Status chan int
}
// worker pairs a Babelfish client with the function that executes tasks; it
// provides the TunnyReady and TunnyJob methods used by the tunny pool.
type worker struct {
	// Client is injected into every task handled by this worker.
	Client *bblfsh.Client
	// Job processes a single task.
	Job func(interface{}) interface{}
}
  58. func (w worker) TunnyReady() bool {
  59. return true
  60. }
  61. func (w worker) TunnyJob(data interface{}) interface{} {
  62. task := data.(uastTask)
  63. task.Client = w.Client
  64. return w.Job(task)
  65. }
  66. func (exr *UASTExtractor) Name() string {
  67. return "UAST"
  68. }
  69. func (exr *UASTExtractor) Provides() []string {
  70. arr := [...]string{"uasts"}
  71. return arr[:]
  72. }
  73. func (exr *UASTExtractor) Requires() []string {
  74. arr := [...]string{"changes", "blob_cache"}
  75. return arr[:]
  76. }
  77. func (exr *UASTExtractor) Features() []string {
  78. arr := [...]string{"uast"}
  79. return arr[:]
  80. }
  81. func (exr *UASTExtractor) ListConfigurationOptions() []ConfigurationOption {
  82. options := [...]ConfigurationOption{{
  83. Name: ConfigUASTEndpoint,
  84. Description: "How many days there are in a single band.",
  85. Flag: "bblfsh",
  86. Type: StringConfigurationOption,
  87. Default: "0.0.0.0:9432"}, {
  88. Name: ConfigUASTTimeout,
  89. Description: "Babelfish's server timeout in seconds.",
  90. Flag: "bblfsh-timeout",
  91. Type: IntConfigurationOption,
  92. Default: 20}, {
  93. Name: ConfigUASTPoolSize,
  94. Description: "Number of goroutines to extract UASTs.",
  95. Flag: "bblfsh-pool-size",
  96. Type: IntConfigurationOption,
  97. Default: runtime.NumCPU()}, {
  98. Name: ConfigUASTFailOnErrors,
  99. Description: "Panic if there is a UAST extraction error.",
  100. Flag: "bblfsh-fail-on-error",
  101. Type: BoolConfigurationOption,
  102. Default: false}, {
  103. Name: ConfigUASTLanguages,
  104. Description: "Programming languages from which to extract UASTs. Separated by comma \",\".",
  105. Flag: "languages",
  106. Type: StringConfigurationOption,
  107. Default: "Python,Java"},
  108. }
  109. return options[:]
  110. }
  111. func (exr *UASTExtractor) Configure(facts map[string]interface{}) {
  112. if val, exists := facts[ConfigUASTEndpoint].(string); exists {
  113. exr.Endpoint = val
  114. }
  115. if val, exists := facts[ConfigUASTTimeout].(int); exists {
  116. exr.Context = func() (context.Context, context.CancelFunc) {
  117. return context.WithTimeout(context.Background(),
  118. time.Duration(val)*time.Second)
  119. }
  120. }
  121. if val, exists := facts[ConfigUASTPoolSize].(int); exists {
  122. exr.PoolSize = val
  123. }
  124. if val, exists := facts[ConfigUASTLanguages].(string); exists {
  125. exr.Languages = map[string]bool{}
  126. for _, lang := range strings.Split(val, ",") {
  127. exr.Languages[strings.TrimSpace(lang)] = true
  128. }
  129. }
  130. if val, exists := facts[ConfigUASTFailOnErrors].(bool); exists {
  131. exr.FailOnErrors = val
  132. }
  133. }
  134. func (exr *UASTExtractor) Initialize(repository *git.Repository) {
  135. if exr.Context == nil {
  136. exr.Context = func() (context.Context, context.CancelFunc) {
  137. return context.Background(), nil
  138. }
  139. }
  140. poolSize := exr.PoolSize
  141. if poolSize == 0 {
  142. poolSize = runtime.NumCPU()
  143. }
  144. var err error
  145. exr.clients = make([]*bblfsh.Client, poolSize)
  146. for i := 0; i < poolSize; i++ {
  147. client, err := bblfsh.NewClient(exr.Endpoint)
  148. if err != nil {
  149. panic(err)
  150. }
  151. exr.clients[i] = client
  152. }
  153. if exr.pool != nil {
  154. exr.pool.Close()
  155. }
  156. workers := make([]tunny.TunnyWorker, poolSize)
  157. for i := 0; i < poolSize; i++ {
  158. workers[i] = worker{Client: exr.clients[i], Job: exr.extractTask}
  159. }
  160. exr.pool, err = tunny.CreateCustomPool(workers).Open()
  161. if err != nil {
  162. panic(err)
  163. }
  164. exr.ProcessedFiles = map[string]int{}
  165. if exr.Languages == nil {
  166. exr.Languages = map[string]bool{}
  167. }
  168. }
  169. func (exr *UASTExtractor) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  170. cache := deps["blob_cache"].(map[plumbing.Hash]*object.Blob)
  171. treeDiffs := deps["changes"].(object.Changes)
  172. uasts := map[plumbing.Hash]*uast.Node{}
  173. lock := sync.RWMutex{}
  174. errs := make([]error, 0)
  175. status := make(chan int)
  176. pending := 0
  177. submit := func(change *object.Change) {
  178. {
  179. reader, err := cache[change.To.TreeEntry.Hash].Reader()
  180. if err != nil {
  181. errs = append(errs, err)
  182. return
  183. }
  184. defer ioutil.CheckClose(reader, &err)
  185. buf := new(bytes.Buffer)
  186. if _, err := buf.ReadFrom(reader); err != nil {
  187. errs = append(errs, err)
  188. return
  189. }
  190. lang := enry.GetLanguage(change.To.Name, buf.Bytes())
  191. if _, exists := exr.Languages[lang]; !exists {
  192. exr.ProcessedFiles[change.To.Name] = UAST_EXTRACTION_SKIPPED
  193. return
  194. }
  195. exr.ProcessedFiles[change.To.Name]++
  196. }
  197. pending++
  198. exr.pool.SendWorkAsync(uastTask{
  199. Lock: &lock,
  200. Dest: uasts,
  201. File: &object.File{Name: change.To.Name, Blob: *cache[change.To.TreeEntry.Hash]},
  202. Errors: &errs, Status: status}, nil)
  203. }
  204. for _, change := range treeDiffs {
  205. action, err := change.Action()
  206. if err != nil {
  207. return nil, err
  208. }
  209. switch action {
  210. case merkletrie.Insert:
  211. submit(change)
  212. case merkletrie.Delete:
  213. continue
  214. case merkletrie.Modify:
  215. submit(change)
  216. }
  217. }
  218. for i := 0; i < pending; i++ {
  219. _ = <-status
  220. }
  221. if len(errs) > 0 {
  222. msgs := make([]string, len(errs))
  223. for i, err := range errs {
  224. msgs[i] = err.Error()
  225. }
  226. joined := strings.Join(msgs, "\n")
  227. if exr.FailOnErrors {
  228. return nil, errors.New(joined)
  229. } else {
  230. fmt.Fprintln(os.Stderr, joined)
  231. }
  232. }
  233. return map[string]interface{}{"uasts": uasts}, nil
  234. }
  235. func (exr *UASTExtractor) extractUAST(
  236. client *bblfsh.Client, file *object.File) (*uast.Node, error) {
  237. request := client.NewParseRequest()
  238. contents, err := file.Contents()
  239. if err != nil {
  240. return nil, err
  241. }
  242. request.Content(contents)
  243. request.Filename(file.Name)
  244. ctx, cancel := exr.Context()
  245. if cancel != nil {
  246. defer cancel()
  247. }
  248. response, err := request.DoWithContext(ctx)
  249. if err != nil {
  250. if strings.Contains("missing driver", err.Error()) {
  251. return nil, nil
  252. }
  253. return nil, err
  254. }
  255. if response.Status != protocol.Ok {
  256. return nil, errors.New(strings.Join(response.Errors, "\n"))
  257. }
  258. if err != nil {
  259. return nil, err
  260. }
  261. return response.UAST, nil
  262. }
  263. func (exr *UASTExtractor) extractTask(data interface{}) interface{} {
  264. task := data.(uastTask)
  265. defer func() { task.Status <- 0 }()
  266. node, err := exr.extractUAST(task.Client, task.File)
  267. task.Lock.Lock()
  268. defer task.Lock.Unlock()
  269. if err != nil {
  270. *task.Errors = append(*task.Errors, errors.New(task.File.Name+": "+err.Error()))
  271. return nil
  272. }
  273. if node != nil {
  274. task.Dest[task.File.Hash] = node
  275. }
  276. return nil
  277. }
// UASTChange pairs a file change with the UASTs before and after it.
type UASTChange struct {
	// Before is the UAST of the previous file version; nil for insertions
	// or when no UAST is known.
	Before *uast.Node
	// After is the UAST of the new file version; nil for deletions or when
	// no UAST is known.
	After *uast.Node
	// Change is the underlying tree change.
	Change *object.Change
}
// UASTChanges turns the "uasts" and "changes" dependencies into a list of
// UASTChange records.
type UASTChanges struct {
	// cache remembers the UAST stored for each blob hash by earlier Consume
	// calls so it can serve as the "before" side of later changes.
	cache map[plumbing.Hash]*uast.Node
}
  286. func (uc *UASTChanges) Name() string {
  287. return "UASTChanges"
  288. }
  289. func (uc *UASTChanges) Provides() []string {
  290. arr := [...]string{"changed_uasts"}
  291. return arr[:]
  292. }
  293. func (uc *UASTChanges) Requires() []string {
  294. arr := [...]string{"uasts", "changes"}
  295. return arr[:]
  296. }
  297. func (uc *UASTChanges) Features() []string {
  298. arr := [...]string{"uast"}
  299. return arr[:]
  300. }
  301. func (uc *UASTChanges) ListConfigurationOptions() []ConfigurationOption {
  302. return []ConfigurationOption{}
  303. }
  304. func (uc *UASTChanges) Configure(facts map[string]interface{}) {}
  305. func (uc *UASTChanges) Initialize(repository *git.Repository) {
  306. uc.cache = map[plumbing.Hash]*uast.Node{}
  307. }
  308. func (uc *UASTChanges) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  309. uasts := deps["uasts"].(map[plumbing.Hash]*uast.Node)
  310. treeDiffs := deps["changes"].(object.Changes)
  311. commit := make([]UASTChange, 0, len(treeDiffs))
  312. for _, change := range treeDiffs {
  313. action, err := change.Action()
  314. if err != nil {
  315. return nil, err
  316. }
  317. switch action {
  318. case merkletrie.Insert:
  319. hashTo := change.To.TreeEntry.Hash
  320. uastTo := uasts[hashTo]
  321. commit = append(commit, UASTChange{Before: nil, After: uastTo, Change: change})
  322. uc.cache[hashTo] = uastTo
  323. case merkletrie.Delete:
  324. hashFrom := change.From.TreeEntry.Hash
  325. commit = append(commit, UASTChange{Before: uc.cache[hashFrom], After: nil, Change: change})
  326. delete(uc.cache, hashFrom)
  327. case merkletrie.Modify:
  328. hashFrom := change.From.TreeEntry.Hash
  329. hashTo := change.To.TreeEntry.Hash
  330. uastTo := uasts[hashTo]
  331. commit = append(commit, UASTChange{Before: uc.cache[hashFrom], After: uastTo, Change: change})
  332. delete(uc.cache, hashFrom)
  333. uc.cache[hashTo] = uastTo
  334. }
  335. }
  336. return map[string]interface{}{"changed_uasts": commit}, nil
  337. }
// UASTChangesSaver accumulates the "changed_uasts" of every Consume call and
// dumps them to files when the result is serialized.
type UASTChangesSaver struct {
	// OutputPath points to the target directory with UASTs
	OutputPath string

	// repository is used to load blob contents while dumping files.
	repository *git.Repository
	// result holds one slice of changes per Consume call.
	result [][]UASTChange
}
const (
	// ConfigUASTChangesSaverOutputPath is the facts key which sets
	// UASTChangesSaver.OutputPath.
	ConfigUASTChangesSaverOutputPath = "UASTChangesSaver.OutputPath"
)
  347. func (saver *UASTChangesSaver) Name() string {
  348. return "UASTChangesSaver"
  349. }
  350. func (saver *UASTChangesSaver) Provides() []string {
  351. return []string{}
  352. }
  353. func (saver *UASTChangesSaver) Requires() []string {
  354. arr := [...]string{"changed_uasts"}
  355. return arr[:]
  356. }
  357. func (saver *UASTChangesSaver) Features() []string {
  358. arr := [...]string{"uast"}
  359. return arr[:]
  360. }
  361. func (saver *UASTChangesSaver) ListConfigurationOptions() []ConfigurationOption {
  362. options := [...]ConfigurationOption{{
  363. Name: ConfigUASTChangesSaverOutputPath,
  364. Description: "The target directory where to store the changed UAST files.",
  365. Flag: "changed-uast-dir",
  366. Type: StringConfigurationOption,
  367. Default: "."},
  368. }
  369. return options[:]
  370. }
  371. func (saver *UASTChangesSaver) Flag() string {
  372. return "dump-uast-changes"
  373. }
  374. func (saver *UASTChangesSaver) Configure(facts map[string]interface{}) {
  375. if val, exists := facts[ConfigUASTChangesSaverOutputPath]; exists {
  376. saver.OutputPath = val.(string)
  377. }
  378. }
  379. func (saver *UASTChangesSaver) Initialize(repository *git.Repository) {
  380. saver.repository = repository
  381. saver.result = [][]UASTChange{}
  382. }
  383. func (saver *UASTChangesSaver) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  384. changes := deps["changed_uasts"].([]UASTChange)
  385. saver.result = append(saver.result, changes)
  386. return nil, nil
  387. }
  388. func (saver *UASTChangesSaver) Finalize() interface{} {
  389. return saver.result
  390. }
  391. func (saver *UASTChangesSaver) Serialize(result interface{}, binary bool, writer io.Writer) error {
  392. saverResult := result.([][]UASTChange)
  393. fileNames := saver.dumpFiles(saverResult)
  394. if binary {
  395. return saver.serializeBinary(fileNames, writer)
  396. }
  397. saver.serializeText(fileNames, writer)
  398. return nil
  399. }
  400. func (saver *UASTChangesSaver) dumpFiles(result [][]UASTChange) []*pb.UASTChange {
  401. fileNames := []*pb.UASTChange{}
  402. for i, changes := range result {
  403. for j, change := range changes {
  404. if change.Before == nil || change.After == nil {
  405. continue
  406. }
  407. record := &pb.UASTChange{FileName: change.Change.To.Name}
  408. bs, _ := change.Before.Marshal()
  409. record.UastBefore = path.Join(saver.OutputPath, fmt.Sprintf(
  410. "%d_%d_before_%s.pb", i, j, change.Change.From.TreeEntry.Hash.String()))
  411. goioutil.WriteFile(record.UastBefore, bs, 0666)
  412. blob, _ := saver.repository.BlobObject(change.Change.From.TreeEntry.Hash)
  413. s, _ := (&object.File{Blob: *blob}).Contents()
  414. record.SrcBefore = path.Join(saver.OutputPath, fmt.Sprintf(
  415. "%d_%d_before_%s.src", i, j, change.Change.From.TreeEntry.Hash.String()))
  416. goioutil.WriteFile(record.SrcBefore, []byte(s), 0666)
  417. bs, _ = change.After.Marshal()
  418. record.UastAfter = path.Join(saver.OutputPath, fmt.Sprintf(
  419. "%d_%d_after_%s.pb", i, j, change.Change.To.TreeEntry.Hash.String()))
  420. goioutil.WriteFile(record.UastAfter, bs, 0666)
  421. blob, _ = saver.repository.BlobObject(change.Change.To.TreeEntry.Hash)
  422. s, _ = (&object.File{Blob: *blob}).Contents()
  423. record.SrcAfter = path.Join(saver.OutputPath, fmt.Sprintf(
  424. "%d_%d_after_%s.src", i, j, change.Change.To.TreeEntry.Hash.String()))
  425. goioutil.WriteFile(record.SrcAfter, []byte(s), 0666)
  426. fileNames = append(fileNames, record)
  427. }
  428. }
  429. return fileNames
  430. }
  431. func (saver *UASTChangesSaver) serializeText(result []*pb.UASTChange, writer io.Writer) {
  432. for _, sc := range result {
  433. kv := [...]string{
  434. "file: " + sc.FileName,
  435. "src0: " + sc.SrcBefore, "src1: " + sc.SrcAfter,
  436. "uast0: " + sc.UastBefore, "uast1: " + sc.UastAfter,
  437. }
  438. fmt.Fprintf(writer, " - {%s}\n", strings.Join(kv[:], ", "))
  439. }
  440. }
  441. func (saver *UASTChangesSaver) serializeBinary(result []*pb.UASTChange, writer io.Writer) error {
  442. message := pb.UASTChangesSaverResults{Changes: result}
  443. serialized, err := proto.Marshal(&message)
  444. if err != nil {
  445. return err
  446. }
  447. writer.Write(serialized)
  448. return nil
  449. }
// init registers a zero-valued instance of every item defined in this file
// with Registry.
func init() {
	Registry.Register(&UASTExtractor{})
	Registry.Register(&UASTChanges{})
	Registry.Register(&UASTChangesSaver{})
}